Diff 181794

lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Show First 20 Lines • Show All 135 Lines • ▼ Show 20 Lines	class VectorLegalizer {
SDValue ExpandFSUB(SDValue Op);		SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);		SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTPOP(SDValue Op);		SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);		SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);		SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFunnelShift(SDValue Op);		SDValue ExpandFunnelShift(SDValue Op);
SDValue ExpandROT(SDValue Op);		SDValue ExpandROT(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);		SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
		SDValue ExpandAddSubSat(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);		SDValue ExpandStrictFPOp(SDValue Op);

/// Implements vector promotion.		/// Implements vector promotion.
///		///
/// This is essentially just bitcasting the operands to a different type and		/// This is essentially just bitcasting the operands to a different type and
/// bitcasting the result back to the original type.		/// bitcasting the result back to the original type.
SDValue Promote(SDValue Op);		SDValue Promote(SDValue Op);

▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	case ISD::STRICT_FTRUNC:
// These pseudo-ops get legalized as if they were their non-strict		// These pseudo-ops get legalized as if they were their non-strict
// equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT		// equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
// is also legal, but if ISD::FSQRT requires expansion then so does		// is also legal, but if ISD::FSQRT requires expansion then so does
// ISD::STRICT_FSQRT.		// ISD::STRICT_FSQRT.
Action = TLI.getStrictFPOperationAction(Node->getOpcode(),		Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
Node->getValueType(0));		Node->getValueType(0));
break;		break;
case ISD::ADD:		case ISD::ADD:
case ISD::SUB:		case ISD::SUB:
		RKSimonUnsubmitted Done Reply Inline Actions Add ISD::USUBSAT et al here as well? RKSimon: Add ISD::USUBSAT et al here as well?
		nikicAuthorUnsubmitted Done Reply Inline Actions It's already present a bit lower (L413). nikic: It's already present a bit lower (L413).
case ISD::MUL:		case ISD::MUL:
case ISD::MULHS:		case ISD::MULHS:
case ISD::MULHU:		case ISD::MULHU:
case ISD::SDIV:		case ISD::SDIV:
case ISD::UDIV:		case ISD::UDIV:
case ISD::SREM:		case ISD::SREM:
case ISD::UREM:		case ISD::UREM:
case ISD::SDIVREM:		case ISD::SDIVREM:
▲ Show 20 Lines • Show All 426 Lines • ▼ Show 20 Lines	SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::FSHR:		case ISD::FSHR:
return ExpandFunnelShift(Op);		return ExpandFunnelShift(Op);
case ISD::ROTL:		case ISD::ROTL:
case ISD::ROTR:		case ISD::ROTR:
return ExpandROT(Op);		return ExpandROT(Op);
case ISD::FMINNUM:		case ISD::FMINNUM:
case ISD::FMAXNUM:		case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);		return ExpandFMINNUM_FMAXNUM(Op);
		case ISD::USUBSAT:
		case ISD::SSUBSAT:
		case ISD::UADDSAT:
		case ISD::SADDSAT:
		return ExpandAddSubSat(Op);
case ISD::STRICT_FADD:		case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:		case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:		case ISD::STRICT_FMUL:
case ISD::STRICT_FDIV:		case ISD::STRICT_FDIV:
case ISD::STRICT_FREM:		case ISD::STRICT_FREM:
case ISD::STRICT_FSQRT:		case ISD::STRICT_FSQRT:
case ISD::STRICT_FMA:		case ISD::STRICT_FMA:
case ISD::STRICT_FPOW:		case ISD::STRICT_FPOW:
▲ Show 20 Lines • Show All 413 Lines • ▼ Show 20 Lines
}		}

SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {		SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))		if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
return Expanded;		return Expanded;
return DAG.UnrollVectorOp(Op.getNode());		return DAG.UnrollVectorOp(Op.getNode());
}		}

		/// Expands a saturating add/sub node (USUBSAT, SSUBSAT, UADDSAT or SADDSAT,
		/// as dispatched from VectorLegalizer::Expand).
		///
		/// Tries TLI.expandAddSubSat first; when that returns an empty SDValue, the
		/// node is handed to DAG.UnrollVectorOp instead.
		SDValue VectorLegalizer::ExpandAddSubSat(SDValue Op) {
		// A non-empty result means the target-lowering helper built a replacement.
		if (SDValue Expanded = TLI.expandAddSubSat(Op.getNode(), DAG))
		return Expanded;
		// No expansion was produced; fall back to unrolling the vector operation.
		return DAG.UnrollVectorOp(Op.getNode());
		}

SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {		SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
EVT EltVT = VT.getVectorElementType();		EVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
unsigned NumOpers = Op.getNumOperands();		unsigned NumOpers = Op.getNumOperands();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ValueVTs[] = {EltVT, MVT::Other};		EVT ValueVTs[] = {EltVT, MVT::Other};
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);
▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines

lib/CodeGen/SelectionDAG/TargetLowering.cpp

Show First 20 Lines • Show All 5,268 Lines • ▼ Show 20 Lines	if (C->isNullValue() && CC == ISD::SETEQ) {
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);		SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,		SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
DAG.getConstant(Log2b, dl, MVT::i32));		DAG.getConstant(Log2b, dl, MVT::i32));
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);		return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
}		}
}		}
return SDValue();		return SDValue();
}		}

SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {		SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
		RKSimonUnsubmitted Done Reply Inline Actions Really pedantic, but this function name is massive - why not just TargetLowering::expandAddSubSat ? RKSimon: Really pedantic, but this function name is massive - why not just TargetLowering…
		nikicAuthorUnsubmitted Done Reply Inline Actions Agreed, I've renamed the method. nikic: Agreed, I've renamed the method.
		RKSimonUnsubmitted Done Reply Inline Actions Thanks - if you can, please pull this out and commit this as an NFC straightaway. RKSimon: Thanks - if you can, please pull this out and commit this as an NFC straightaway.
unsigned Opcode = Node->getOpcode();		unsigned Opcode = Node->getOpcode();
		SDValue LHS = Node->getOperand(0);
		SDValue RHS = Node->getOperand(1);
		EVT VT = LHS.getValueType();
		SDLoc dl(Node);

		// usub.sat(a, b) -> umax(a, b) - b
		if (Opcode == ISD::USUBSAT && isOperationLegalOrCustom(ISD::UMAX, VT)) {
		SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
		RKSimonUnsubmitted Done Reply Inline Actions isOperationLegalOrCustom? RKSimon: isOperationLegalOrCustom?
		nikicAuthorUnsubmitted Done Reply Inline Actions Extra diff allowing custom: https://gist.github.com/nikic/4c46634cec8f319e687c6b5cb0496648 This is presumably better than scalarizing, but I was wondering if a more explicit expansion would work better? As another variant, this is what happens if I just fall through to the USUBO+SELECT expansion: https://gist.github.com/nikic/d989121f7f9898437a1255548c148904 nikic: Extra diff allowing custom: https://gist.github.com/nikic/4c46634cec8f319e687c6b5cb0496648…
		RKSimonUnsubmitted Done Reply Inline Actions Allowing legalorcustom looks OK to me - we can tweak per-target codegen in future commits if it's useful. Also, should we try to do this on scalars as well before defaulting to add/sub overflow? AMDGPU i32, for instance, should benefit. RKSimon: Allowing legalorcustom looks OK to me - we can tweak per-target codegen in future commits if…
		nikicAuthorUnsubmitted Done Reply Inline Actions Okay, I did both changes. I agree that preferring this expansion makes sense even for scalar. I've also dropped the changes in X86ISelLowering. Now that custom is allowed here, they are no longer necessary. We'll just expand to a wide UMAX here that will get split (rather than the USUBSAT getting split and then each half expanded to UMAX). nikic: Okay, I did both changes. I agree that preferring this expansion makes sense even for scalar.
		return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
		}

		if (VT.isVector()) {
		// TODO: Consider not scalarizing here.
		return SDValue();
		}

unsigned OverflowOp;		unsigned OverflowOp;
switch (Opcode) {		switch (Opcode) {
case ISD::SADDSAT:		case ISD::SADDSAT:
OverflowOp = ISD::SADDO;		OverflowOp = ISD::SADDO;
break;		break;
case ISD::UADDSAT:		case ISD::UADDSAT:
OverflowOp = ISD::UADDO;		OverflowOp = ISD::UADDO;
break;		break;
case ISD::SSUBSAT:		case ISD::SSUBSAT:
OverflowOp = ISD::SSUBO;		OverflowOp = ISD::SSUBO;
break;		break;
case ISD::USUBSAT:		case ISD::USUBSAT:
OverflowOp = ISD::USUBO;		OverflowOp = ISD::USUBO;
break;		break;
default:		default:
llvm_unreachable("Expected method to receive signed or unsigned saturation "		llvm_unreachable("Expected method to receive signed or unsigned saturation "
"addition or subtraction node.");		"addition or subtraction node.");
}		}
assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");

SDLoc dl(Node);
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
assert(LHS.getValueType().isScalarInteger() &&		assert(LHS.getValueType().isScalarInteger() &&
"Expected operands to be integers. Vector of int arguments should "		"Expected operands to be integers. Vector of int arguments should "
"already be unrolled.");		"already be unrolled.");
assert(RHS.getValueType().isScalarInteger() &&		assert(RHS.getValueType().isScalarInteger() &&
"Expected operands to be integers. Vector of int arguments should "		"Expected operands to be integers. Vector of int arguments should "
"already be unrolled.");		"already be unrolled.");
assert(LHS.getValueType() == RHS.getValueType() &&		assert(LHS.getValueType() == RHS.getValueType() &&
"Expected both operands to be the same type");		"Expected both operands to be the same type");
▲ Show 20 Lines • Show All 182 Lines • Show Last 20 Lines

lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,774 Lines • ▼ Show 20 Lines	static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },		{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },		{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::CTLZ, MVT::v8i64, 29 },		{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },		{ ISD::CTLZ, MVT::v16i32, 35 },
{ ISD::CTPOP, MVT::v8i64, 16 },		{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },		{ ISD::CTPOP, MVT::v16i32, 24 },
{ ISD::CTTZ, MVT::v8i64, 20 },		{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },		{ ISD::CTTZ, MVT::v16i32, 28 },
		{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
		{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
		{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
		{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
};		};
static const CostTblEntry XOPCostTbl[] = {		static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },		{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },		{ ISD::BITREVERSE, MVT::v8i32, 4 },
{ ISD::BITREVERSE, MVT::v16i16, 4 },		{ ISD::BITREVERSE, MVT::v16i16, 4 },
{ ISD::BITREVERSE, MVT::v32i8, 4 },		{ ISD::BITREVERSE, MVT::v32i8, 4 },
{ ISD::BITREVERSE, MVT::v2i64, 1 },		{ ISD::BITREVERSE, MVT::v2i64, 1 },
{ ISD::BITREVERSE, MVT::v4i32, 1 },		{ ISD::BITREVERSE, MVT::v4i32, 1 },
Show All 27 Lines	static const CostTblEntry AVX2CostTbl[] = {
{ ISD::SADDSAT, MVT::v16i16, 1 },		{ ISD::SADDSAT, MVT::v16i16, 1 },
{ ISD::SADDSAT, MVT::v32i8, 1 },		{ ISD::SADDSAT, MVT::v32i8, 1 },
{ ISD::SSUBSAT, MVT::v16i16, 1 },		{ ISD::SSUBSAT, MVT::v16i16, 1 },
{ ISD::SSUBSAT, MVT::v32i8, 1 },		{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },		{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },		{ ISD::UADDSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v16i16, 1 },		{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },		{ ISD::USUBSAT, MVT::v32i8, 1 },
		{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};		};
static const CostTblEntry AVX1CostTbl[] = {		static const CostTblEntry AVX1CostTbl[] = {
Show All 19 Lines	static const CostTblEntry AVX1CostTbl[] = {
{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
		{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};		};
static const CostTblEntry GLMCostTbl[] = {		static const CostTblEntry GLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, 19 }, // sqrtss		{ ISD::FSQRT, MVT::f32, 19 }, // sqrtss
{ ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps		{ ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
{ ISD::FSQRT, MVT::f64, 34 }, // sqrtsd		{ ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
{ ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd		{ ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
};		};
static const CostTblEntry SLMCostTbl[] = {		static const CostTblEntry SLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, 20 }, // sqrtss		{ ISD::FSQRT, MVT::f32, 20 }, // sqrtss
{ ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps		{ ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
{ ISD::FSQRT, MVT::f64, 35 }, // sqrtsd		{ ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd		{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
};		};
static const CostTblEntry SSE42CostTbl[] = {		static const CostTblEntry SSE42CostTbl[] = {
		{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};		};
static const CostTblEntry SSSE3CostTbl[] = {		static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },		{ ISD::BITREVERSE, MVT::v2i64, 5 },
{ ISD::BITREVERSE, MVT::v4i32, 5 },		{ ISD::BITREVERSE, MVT::v4i32, 5 },
{ ISD::BITREVERSE, MVT::v8i16, 5 },		{ ISD::BITREVERSE, MVT::v8i16, 5 },
{ ISD::BITREVERSE, MVT::v16i8, 5 },		{ ISD::BITREVERSE, MVT::v16i8, 5 },
▲ Show 20 Lines • Show All 1,374 Lines • Show Last 20 Lines

test/Analysis/CostModel/X86/arith-usat.ll

	Show First 20 Lines • Show All 244 Lines • ▼ Show 20 Lines
	declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)			declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)

	; Declarations of the llvm.usub.sat intrinsic for i8 elements: the scalar
	; form plus the 16-, 32- and 64-element vector forms (128, 256 and 512 bits).
	declare i8 @llvm.usub.sat.i8(i8, i8)			declare i8 @llvm.usub.sat.i8(i8, i8)
	declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)			declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)
	declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)			declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
	declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)			declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)

	define i32 @sub(i32 %arg) {			define i32 @sub(i32 %arg) {
	; SSE-LABEL: 'sub'			; SSSE3-LABEL: 'sub'
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
				;
				; SSE42-LABEL: 'sub'
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX1-LABEL: 'sub'			; AVX1-LABEL: 'sub'
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX2-LABEL: 'sub'			; AVX2-LABEL: 'sub'
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512F-LABEL: 'sub'			; AVX512F-LABEL: 'sub'
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512BW-LABEL: 'sub'			; AVX512BW-LABEL: 'sub'
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512DQ-LABEL: 'sub'			; AVX512DQ-LABEL: 'sub'
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; SLM-LABEL: 'sub'			; SLM-LABEL: 'sub'
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; GLM-LABEL: 'sub'			; GLM-LABEL: 'sub'
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; BTVER2-LABEL: 'sub'			; BTVER2-LABEL: 'sub'
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 24 Lines

test/CodeGen/X86/usub_sat.ll

	Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines
	; X86-NEXT: movl %edi, (%eax)			; X86-NEXT: movl %edi, (%eax)
	; X86-NEXT: popl %esi			; X86-NEXT: popl %esi
	; X86-NEXT: popl %edi			; X86-NEXT: popl %edi
	; X86-NEXT: popl %ebx			; X86-NEXT: popl %ebx
	; X86-NEXT: retl $4			; X86-NEXT: retl $4
	;			;
	; X64-LABEL: vec:			; X64-LABEL: vec:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; X64-NEXT: movd %xmm2, %eax			; X64-NEXT: movdqa %xmm1, %xmm3
	; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; X64-NEXT: pxor %xmm2, %xmm3
	; X64-NEXT: movd %xmm2, %ecx			; X64-NEXT: pxor %xmm0, %xmm2
	; X64-NEXT: xorl %edx, %edx			; X64-NEXT: pcmpgtd %xmm3, %xmm2
	; X64-NEXT: subl %eax, %ecx			; X64-NEXT: pand %xmm2, %xmm0
	; X64-NEXT: cmovbl %edx, %ecx			; X64-NEXT: pandn %xmm1, %xmm2
	; X64-NEXT: movd %ecx, %xmm2			; X64-NEXT: por %xmm2, %xmm0
	; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; X64-NEXT: psubd %xmm1, %xmm0
	; X64-NEXT: movd %xmm3, %eax
	; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; X64-NEXT: movd %xmm3, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm3
	; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; X64-NEXT: movd %xmm1, %eax
	; X64-NEXT: movd %xmm0, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm2
	; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; X64-NEXT: movd %xmm1, %eax
	; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; X64-NEXT: movd %xmm0, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm0
	; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; X64-NEXT: movdqa %xmm2, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);			%tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
	ret <4 x i32> %tmp;			ret <4 x i32> %tmp;
	}			}

test/CodeGen/X86/usub_sat_vec.ll

	Show First 20 Lines • Show All 628 Lines • ▼ Show 20 Lines
	}			}

	; Expanded			; Expanded

	define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {			define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
	; SSE2-LABEL: v2i32:			; SSE2-LABEL: v2i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: psllq $32, %xmm1			; SSE2-NEXT: psllq $32, %xmm1
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
				; SSE2-NEXT: movdqa %xmm1, %xmm3
				; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: psllq $32, %xmm0			; SSE2-NEXT: psllq $32, %xmm0
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: por %xmm2, %xmm3
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pand %xmm3, %xmm0
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pandn %xmm1, %xmm3
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: por %xmm3, %xmm0
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: psubq %xmm1, %xmm0
	; SSE2-NEXT: psrlq $32, %xmm2			; SSE2-NEXT: psrlq $32, %xmm0
	; SSE2-NEXT: movdqa %xmm2, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v2i32:			; SSSE3-LABEL: v2i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: psllq $32, %xmm1			; SSSE3-NEXT: psllq $32, %xmm1
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
				; SSSE3-NEXT: movdqa %xmm1, %xmm3
				; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: psllq $32, %xmm0			; SSSE3-NEXT: psllq $32, %xmm0
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: por %xmm2, %xmm3
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pand %xmm3, %xmm0
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pandn %xmm1, %xmm3
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: por %xmm3, %xmm0
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: psubq %xmm1, %xmm0
	; SSSE3-NEXT: psrlq $32, %xmm2			; SSSE3-NEXT: psrlq $32, %xmm0
	; SSSE3-NEXT: movdqa %xmm2, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v2i32:			; SSE41-LABEL: v2i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
				; SSE41-NEXT: movdqa %xmm0, %xmm2
	; SSE41-NEXT: psllq $32, %xmm1			; SSE41-NEXT: psllq $32, %xmm1
	; SSE41-NEXT: pextrq $1, %xmm1, %rax			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: psllq $32, %xmm0			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: pxor %xmm0, %xmm3
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: psllq $32, %xmm2
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm2, %xmm0
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE41-NEXT: movq %xmm1, %rax			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pand %xmm5, %xmm0
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: por %xmm4, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: psrlq $32, %xmm0			; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
				; SSE41-NEXT: psubq %xmm1, %xmm3
				; SSE41-NEXT: psrlq $32, %xmm3
				; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v2i32:			; AVX1-LABEL: v2i32:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: vpsllq $32, %xmm1, %xmm1			; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
	; AVX-NEXT: vpextrq $1, %xmm1, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpsllq $32, %xmm0, %xmm0			; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: vpextrq $1, %xmm0, %rcx			; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
	; AVX-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
	; AVX-NEXT: subq %rax, %rcx			; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
	; AVX-NEXT: vmovq %rcx, %xmm2			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm1, %rax			; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm0, %rcx			; AVX1-NEXT: retq
	; AVX-NEXT: subq %rax, %rcx			;
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX2-LABEL: v2i32:
	; AVX-NEXT: vmovq %rcx, %xmm0			; AVX2: # %bb.0:
	; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
	; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0			; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: retq			; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
				; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
				; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
				; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
				; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
				; AVX2-NEXT: retq
				;
				; AVX512-LABEL: v2i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
				; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
				; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
				; AVX512-NEXT: retq
	%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)			%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
	ret <2 x i32> %z			ret <2 x i32> %z
	}			}

	define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {			define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
	; SSE2-LABEL: v4i32:			; SSE2-LABEL: v4i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; SSE2-NEXT: movd %xmm2, %eax			; SSE2-NEXT: movdqa %xmm1, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: movd %xmm2, %ecx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
	; SSE2-NEXT: subl %eax, %ecx			; SSE2-NEXT: pand %xmm2, %xmm0
	; SSE2-NEXT: cmovbl %edx, %ecx			; SSE2-NEXT: pandn %xmm1, %xmm2
	; SSE2-NEXT: movd %ecx, %xmm2			; SSE2-NEXT: por %xmm2, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; SSE2-NEXT: psubd %xmm1, %xmm0
	; SSE2-NEXT: movd %xmm3, %eax
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm3
	; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm0
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; SSE2-NEXT: movdqa %xmm2, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v4i32:			; SSSE3-LABEL: v4i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; SSSE3-NEXT: movd %xmm2, %eax			; SSSE3-NEXT: movdqa %xmm1, %xmm3
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: movd %xmm2, %ecx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
	; SSSE3-NEXT: subl %eax, %ecx			; SSSE3-NEXT: pand %xmm2, %xmm0
	; SSSE3-NEXT: cmovbl %edx, %ecx			; SSSE3-NEXT: pandn %xmm1, %xmm2
	; SSSE3-NEXT: movd %ecx, %xmm2			; SSSE3-NEXT: por %xmm2, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; SSSE3-NEXT: psubd %xmm1, %xmm0
	; SSSE3-NEXT: movd %xmm3, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm3
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm1, %eax
	; SSSE3-NEXT: movd %xmm0, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSSE3-NEXT: movd %xmm1, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; SSSE3-NEXT: movd %xmm0, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm0
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; SSSE3-NEXT: movdqa %xmm2, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v4i32:			; SSE41-LABEL: v4i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrd $1, %xmm1, %eax			; SSE41-NEXT: pmaxud %xmm1, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm0, %ecx			; SSE41-NEXT: psubd %xmm1, %xmm0
	; SSE41-NEXT: xorl %edx, %edx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: movd %xmm1, %eax
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %eax, %esi
	; SSE41-NEXT: cmovbl %edx, %esi
	; SSE41-NEXT: movd %esi, %xmm2
	; SSE41-NEXT: pinsrd $1, %ecx, %xmm2
	; SSE41-NEXT: pextrd $2, %xmm1, %eax
	; SSE41-NEXT: pextrd $2, %xmm0, %ecx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: pinsrd $2, %ecx, %xmm2
	; SSE41-NEXT: pextrd $3, %xmm1, %eax
	; SSE41-NEXT: pextrd $3, %xmm0, %ecx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: pinsrd $3, %ecx, %xmm2
	; SSE41-NEXT: movdqa %xmm2, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v4i32:			; AVX-LABEL: v4i32:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpextrd $1, %xmm1, %eax			; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpextrd $1, %xmm0, %ecx			; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX-NEXT: xorl %edx, %edx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vmovd %xmm1, %eax
	; AVX-NEXT: vmovd %xmm0, %esi
	; AVX-NEXT: subl %eax, %esi
	; AVX-NEXT: cmovbl %edx, %esi
	; AVX-NEXT: vmovd %esi, %xmm2
	; AVX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
	; AVX-NEXT: vpextrd $2, %xmm1, %eax
	; AVX-NEXT: vpextrd $2, %xmm0, %ecx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
	; AVX-NEXT: vpextrd $3, %xmm1, %eax
	; AVX-NEXT: vpextrd $3, %xmm0, %ecx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)			%z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
	ret <4 x i32> %z			ret <4 x i32> %z
	}			}

	define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {			define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
	; SSE2-LABEL: v8i32:			; SSE2-LABEL: v8i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
				; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
				; SSE2-NEXT: movdqa %xmm2, %xmm6
				; SSE2-NEXT: pxor %xmm5, %xmm6
	; SSE2-NEXT: movdqa %xmm0, %xmm4			; SSE2-NEXT: movdqa %xmm0, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]			; SSE2-NEXT: pxor %xmm5, %xmm4
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSE2-NEXT: pand %xmm4, %xmm0
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: pandn %xmm2, %xmm4
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: por %xmm0, %xmm4
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: psubd %xmm2, %xmm4
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: movdqa %xmm3, %xmm0
	; SSE2-NEXT: movd %edx, %xmm0			; SSE2-NEXT: pxor %xmm5, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]			; SSE2-NEXT: pxor %xmm1, %xmm5
	; SSE2-NEXT: movd %xmm5, %ecx			; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]			; SSE2-NEXT: pand %xmm5, %xmm1
	; SSE2-NEXT: movd %xmm5, %edx			; SSE2-NEXT: pandn %xmm3, %xmm5
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: por %xmm5, %xmm1
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: psubd %xmm3, %xmm1
	; SSE2-NEXT: movd %edx, %xmm5			; SSE2-NEXT: movdqa %xmm4, %xmm0
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: movd %xmm1, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSE2-NEXT: movd %xmm1, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; SSE2-NEXT: movdqa %xmm2, %xmm1
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v8i32:			; SSSE3-LABEL: v8i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
				; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
				; SSSE3-NEXT: movdqa %xmm2, %xmm6
				; SSSE3-NEXT: pxor %xmm5, %xmm6
	; SSSE3-NEXT: movdqa %xmm0, %xmm4			; SSSE3-NEXT: movdqa %xmm0, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]			; SSSE3-NEXT: pxor %xmm5, %xmm4
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSSE3-NEXT: pand %xmm4, %xmm0
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: pandn %xmm2, %xmm4
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: por %xmm0, %xmm4
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: psubd %xmm2, %xmm4
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: movdqa %xmm3, %xmm0
	; SSSE3-NEXT: movd %edx, %xmm0			; SSSE3-NEXT: pxor %xmm5, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]			; SSSE3-NEXT: pxor %xmm1, %xmm5
	; SSSE3-NEXT: movd %xmm5, %ecx			; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]			; SSSE3-NEXT: pand %xmm5, %xmm1
	; SSSE3-NEXT: movd %xmm5, %edx			; SSSE3-NEXT: pandn %xmm3, %xmm5
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: por %xmm5, %xmm1
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: psubd %xmm3, %xmm1
	; SSSE3-NEXT: movd %edx, %xmm5			; SSSE3-NEXT: movdqa %xmm4, %xmm0
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: movd %xmm1, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSSE3-NEXT: movd %xmm1, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; SSSE3-NEXT: movdqa %xmm2, %xmm1
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v8i32:			; SSE41-LABEL: v8i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: movdqa %xmm0, %xmm4			; SSE41-NEXT: pmaxud %xmm2, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm2, %ecx			; SSE41-NEXT: psubd %xmm2, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm0, %edx			; SSE41-NEXT: pmaxud %xmm3, %xmm1
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: psubd %xmm3, %xmm1
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm2, %ecx
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pinsrd $1, %edx, %xmm0
	; SSE41-NEXT: pextrd $2, %xmm2, %ecx
	; SSE41-NEXT: pextrd $2, %xmm4, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm0
	; SSE41-NEXT: pextrd $3, %xmm2, %ecx
	; SSE41-NEXT: pextrd $3, %xmm4, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm3, %ecx
	; SSE41-NEXT: pextrd $1, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm3, %ecx
	; SSE41-NEXT: movd %xmm1, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm2
	; SSE41-NEXT: pinsrd $1, %edx, %xmm2
	; SSE41-NEXT: pextrd $2, %xmm3, %ecx
	; SSE41-NEXT: pextrd $2, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm2
	; SSE41-NEXT: pextrd $3, %xmm3, %ecx
	; SSE41-NEXT: pextrd $3, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm2
	; SSE41-NEXT: movdqa %xmm2, %xmm1
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v8i32:			; AVX1-LABEL: v8i32:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
	; AVX1-NEXT: vpextrd $1, %xmm3, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm2, %ecx
	; AVX1-NEXT: vmovd %xmm3, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm4
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm3, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm3, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm1, %ecx
	; AVX1-NEXT: vmovd %xmm0, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm3
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v8i32:			; AVX2-LABEL: v8i32:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx			; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX2-NEXT: vpextrd $1, %xmm3, %edx
	; AVX2-NEXT: xorl %eax, %eax
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm3, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm4
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm3, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm3, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm1, %ecx
	; AVX2-NEXT: vmovd %xmm0, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm3
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v8i32:			; AVX512-LABEL: v8i32:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vpextrd $1, %xmm2, %ecx			; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %edx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm2, %ecx
	; AVX512-NEXT: vmovd %xmm3, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $1, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm1, %ecx
	; AVX512-NEXT: vmovd %xmm0, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm3
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX512-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)			%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
	ret <8 x i32> %z			ret <8 x i32> %z
	}			}

	define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {			define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
	; SSE2-LABEL: v16i32:			; SSE2-LABEL: v16i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movdqa %xmm1, %xmm8			; SSE2-NEXT: movdqa %xmm1, %xmm8
	; SSE2-NEXT: movdqa %xmm0, %xmm1			; SSE2-NEXT: movdqa %xmm0, %xmm10
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: movdqa %xmm4, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]			; SSE2-NEXT: pxor %xmm9, %xmm1
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: pxor %xmm9, %xmm0
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: pand %xmm0, %xmm10
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pandn %xmm4, %xmm0
	; SSE2-NEXT: movd %edx, %xmm9			; SSE2-NEXT: por %xmm10, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]			; SSE2-NEXT: psubd %xmm4, %xmm0
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: movdqa %xmm5, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSE2-NEXT: pxor %xmm9, %xmm4
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: movdqa %xmm8, %xmm1
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: pxor %xmm9, %xmm1
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
	; SSE2-NEXT: movd %edx, %xmm10			; SSE2-NEXT: pand %xmm1, %xmm8
	; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]			; SSE2-NEXT: pandn %xmm5, %xmm1
	; SSE2-NEXT: movd %xmm4, %ecx			; SSE2-NEXT: por %xmm8, %xmm1
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: psubd %xmm5, %xmm1
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: movdqa %xmm6, %xmm5
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pxor %xmm9, %xmm5
	; SSE2-NEXT: movd %edx, %xmm0			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]			; SSE2-NEXT: pxor %xmm9, %xmm4
	; SSE2-NEXT: movd %xmm4, %ecx			; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]			; SSE2-NEXT: pand %xmm4, %xmm2
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: pandn %xmm6, %xmm4
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: por %xmm2, %xmm4
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: psubd %xmm6, %xmm4
	; SSE2-NEXT: movd %edx, %xmm1			; SSE2-NEXT: movdqa %xmm7, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSE2-NEXT: pxor %xmm9, %xmm2
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]			; SSE2-NEXT: pxor %xmm3, %xmm9
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]			; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
	; SSE2-NEXT: movd %xmm1, %ecx			; SSE2-NEXT: pand %xmm9, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]			; SSE2-NEXT: pandn %xmm7, %xmm9
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: por %xmm9, %xmm3
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: psubd %xmm7, %xmm3
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: movd %xmm8, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
	; SSE2-NEXT: movd %xmm6, %ecx
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
	; SSE2-NEXT: movd %xmm6, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm6
	; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
	; SSE2-NEXT: movd %xmm7, %ecx
	; SSE2-NEXT: movd %xmm3, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
	; SSE2-NEXT: movdqa %xmm4, %xmm2			; SSE2-NEXT: movdqa %xmm4, %xmm2
	; SSE2-NEXT: movdqa %xmm5, %xmm3
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v16i32:			; SSSE3-LABEL: v16i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movdqa %xmm1, %xmm8			; SSSE3-NEXT: movdqa %xmm1, %xmm8
	; SSSE3-NEXT: movdqa %xmm0, %xmm1			; SSSE3-NEXT: movdqa %xmm0, %xmm10
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: movdqa %xmm4, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]			; SSSE3-NEXT: pxor %xmm9, %xmm1
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm0
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: pand %xmm0, %xmm10
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pandn %xmm4, %xmm0
	; SSSE3-NEXT: movd %edx, %xmm9			; SSSE3-NEXT: por %xmm10, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]			; SSSE3-NEXT: psubd %xmm4, %xmm0
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: movdqa %xmm5, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSSE3-NEXT: pxor %xmm9, %xmm4
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: movdqa %xmm8, %xmm1
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm1
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
	; SSSE3-NEXT: movd %edx, %xmm10			; SSSE3-NEXT: pand %xmm1, %xmm8
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]			; SSSE3-NEXT: pandn %xmm5, %xmm1
	; SSSE3-NEXT: movd %xmm4, %ecx			; SSSE3-NEXT: por %xmm8, %xmm1
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: psubd %xmm5, %xmm1
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: movdqa %xmm6, %xmm5
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm5
	; SSSE3-NEXT: movd %edx, %xmm0			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]			; SSSE3-NEXT: pxor %xmm9, %xmm4
	; SSSE3-NEXT: movd %xmm4, %ecx			; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]			; SSSE3-NEXT: pand %xmm4, %xmm2
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: pandn %xmm6, %xmm4
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: por %xmm2, %xmm4
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: psubd %xmm6, %xmm4
	; SSSE3-NEXT: movd %edx, %xmm1			; SSSE3-NEXT: movdqa %xmm7, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSSE3-NEXT: pxor %xmm9, %xmm2
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]			; SSSE3-NEXT: pxor %xmm3, %xmm9
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]			; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
	; SSSE3-NEXT: movd %xmm1, %ecx			; SSSE3-NEXT: pand %xmm9, %xmm3
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]			; SSSE3-NEXT: pandn %xmm7, %xmm9
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: por %xmm9, %xmm3
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: psubd %xmm7, %xmm3
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: movd %xmm8, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
	; SSSE3-NEXT: movd %xmm6, %ecx
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
	; SSSE3-NEXT: movd %xmm6, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm6
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm7, %ecx
	; SSSE3-NEXT: movd %xmm3, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
	; SSSE3-NEXT: movdqa %xmm4, %xmm2			; SSSE3-NEXT: movdqa %xmm4, %xmm2
	; SSSE3-NEXT: movdqa %xmm5, %xmm3
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v16i32:			; SSE41-LABEL: v16i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: movdqa %xmm1, %xmm8			; SSE41-NEXT: pmaxud %xmm4, %xmm0
	; SSE41-NEXT: movdqa %xmm0, %xmm1			; SSE41-NEXT: psubd %xmm4, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm4, %ecx			; SSE41-NEXT: pmaxud %xmm5, %xmm1
	; SSE41-NEXT: pextrd $1, %xmm0, %edx			; SSE41-NEXT: psubd %xmm5, %xmm1
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: pmaxud %xmm6, %xmm2
	; SSE41-NEXT: subl %ecx, %edx			; SSE41-NEXT: psubd %xmm6, %xmm2
	; SSE41-NEXT: cmovbl %eax, %edx			; SSE41-NEXT: pmaxud %xmm7, %xmm3
	; SSE41-NEXT: movd %xmm4, %ecx			; SSE41-NEXT: psubd %xmm7, %xmm3
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pinsrd $1, %edx, %xmm0
	; SSE41-NEXT: pextrd $2, %xmm4, %ecx
	; SSE41-NEXT: pextrd $2, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm0
	; SSE41-NEXT: pextrd $3, %xmm4, %ecx
	; SSE41-NEXT: pextrd $3, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm5, %ecx
	; SSE41-NEXT: pextrd $1, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm5, %ecx
	; SSE41-NEXT: movd %xmm8, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm1
	; SSE41-NEXT: pinsrd $1, %edx, %xmm1
	; SSE41-NEXT: pextrd $2, %xmm5, %ecx
	; SSE41-NEXT: pextrd $2, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm1
	; SSE41-NEXT: pextrd $3, %xmm5, %ecx
	; SSE41-NEXT: pextrd $3, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm1
	; SSE41-NEXT: pextrd $1, %xmm6, %ecx
	; SSE41-NEXT: pextrd $1, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm6, %ecx
	; SSE41-NEXT: movd %xmm2, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm4
	; SSE41-NEXT: pinsrd $1, %edx, %xmm4
	; SSE41-NEXT: pextrd $2, %xmm6, %ecx
	; SSE41-NEXT: pextrd $2, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm4
	; SSE41-NEXT: pextrd $3, %xmm6, %ecx
	; SSE41-NEXT: pextrd $3, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm4
	; SSE41-NEXT: pextrd $1, %xmm7, %ecx
	; SSE41-NEXT: pextrd $1, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm7, %ecx
	; SSE41-NEXT: movd %xmm3, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm5
	; SSE41-NEXT: pinsrd $1, %edx, %xmm5
	; SSE41-NEXT: pextrd $2, %xmm7, %ecx
	; SSE41-NEXT: pextrd $2, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm5
	; SSE41-NEXT: pextrd $3, %xmm7, %ecx
	; SSE41-NEXT: pextrd $3, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm5
	; SSE41-NEXT: movdqa %xmm4, %xmm2
	; SSE41-NEXT: movdqa %xmm5, %xmm3
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v16i32:			; AVX1-LABEL: v16i32:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm4, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
	; AVX1-NEXT: vpextrd $1, %xmm5, %edx			; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm5
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm4, %ecx
	; AVX1-NEXT: vmovd %xmm5, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm6
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
	; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm5, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
	; AVX1-NEXT: vpextrd $3, %xmm4, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm5, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm2, %ecx
	; AVX1-NEXT: vmovd %xmm0, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm5
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm4, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm4
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vmovd %xmm2, %ecx			; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vmovd %xmm4, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm5
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm4, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm4, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm3, %ecx
	; AVX1-NEXT: vmovd %xmm1, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm4
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v16i32:			; AVX2-LABEL: v16i32:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4			; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: vpextrd $1, %xmm4, %ecx			; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5			; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: vpextrd $1, %xmm5, %edx			; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: xorl %eax, %eax
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm4, %ecx
	; AVX2-NEXT: vmovd %xmm5, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm6
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
	; AVX2-NEXT: vpextrd $2, %xmm4, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm5, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
	; AVX2-NEXT: vpextrd $3, %xmm4, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm5, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm0, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm5
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
	; AVX2-NEXT: vpextrd $1, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm4, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm5
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm3, %ecx
	; AVX2-NEXT: vmovd %xmm1, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm4
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v16i32:			; AVX512-LABEL: v16i32:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2			; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vpextrd $1, %xmm2, %ecx			; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %edx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm2, %ecx
	; AVX512-NEXT: vmovd %xmm3, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4
	; AVX512-NEXT: vpextrd $1, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm3, %ecx
	; AVX512-NEXT: vmovd %xmm4, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm5
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
	; AVX512-NEXT: vpextrd $1, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm3, %ecx
	; AVX512-NEXT: vmovd %xmm4, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm5
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $1, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm1, %ecx
	; AVX512-NEXT: vmovd %xmm0, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
	; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
	; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)			%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
	ret <16 x i32> %z			ret <16 x i32> %z
	}			}

	define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {			define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
	; SSE2-LABEL: v2i64:			; SSE2-LABEL: v2i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: movdqa %xmm1, %xmm3
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: por %xmm2, %xmm3
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: pand %xmm3, %xmm0
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: pandn %xmm1, %xmm3
	; SSE2-NEXT: movdqa %xmm2, %xmm0			; SSE2-NEXT: por %xmm3, %xmm0
				; SSE2-NEXT: psubq %xmm1, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v2i64:			; SSSE3-LABEL: v2i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: movdqa %xmm1, %xmm3
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: por %xmm2, %xmm3
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: pand %xmm3, %xmm0
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: pandn %xmm1, %xmm3
	; SSSE3-NEXT: movdqa %xmm2, %xmm0			; SSSE3-NEXT: por %xmm3, %xmm0
				; SSSE3-NEXT: psubq %xmm1, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v2i64:			; SSE41-LABEL: v2i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm1, %rax			; SSE41-NEXT: movdqa %xmm0, %xmm2
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm0, %xmm3
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pxor %xmm2, %xmm0
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: movq %xmm1, %rax			; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: pand %xmm5, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; SSE41-NEXT: por %xmm4, %xmm0
				; SSE41-NEXT: movdqa %xmm1, %xmm3
				; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
				; SSE41-NEXT: psubq %xmm1, %xmm3
				; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v2i64:			; AVX1-LABEL: v2i64:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: vpextrq $1, %xmm1, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpextrq $1, %xmm0, %rcx			; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
	; AVX-NEXT: subq %rax, %rcx			; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
	; AVX-NEXT: vmovq %rcx, %xmm2			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm1, %rax			; AVX1-NEXT: retq
	; AVX-NEXT: vmovq %xmm0, %rcx			;
	; AVX-NEXT: subq %rax, %rcx			; AVX2-LABEL: v2i64:
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX2: # %bb.0:
	; AVX-NEXT: vmovq %rcx, %xmm0			; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: retq			; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
				; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
				; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX2-NEXT: retq
				;
				; AVX512-LABEL: v2i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: retq
	%z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)			%z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
	ret <2 x i64> %z			ret <2 x i64> %z
	}			}

	define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {			define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
	; SSE2-LABEL: v4i64:			; SSE2-LABEL: v4i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movq %xmm2, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm5
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pxor %xmm4, %xmm5
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: movdqa %xmm0, %xmm6
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pxor %xmm4, %xmm6
	; SSE2-NEXT: movq %rcx, %xmm4			; SSE2-NEXT: movdqa %xmm6, %xmm7
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
	; SSE2-NEXT: movq %xmm2, %rax			; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pand %xmm8, %xmm5
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: por %xmm5, %xmm6
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]			; SSE2-NEXT: pand %xmm6, %xmm0
	; SSE2-NEXT: movq %xmm3, %rax			; SSE2-NEXT: pandn %xmm2, %xmm6
	; SSE2-NEXT: movq %xmm1, %rcx			; SSE2-NEXT: por %xmm6, %xmm0
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: psubq %xmm2, %xmm0
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: movdqa %xmm3, %xmm2
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pxor %xmm4, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]			; SSE2-NEXT: pxor %xmm1, %xmm4
	; SSE2-NEXT: movq %xmm0, %rax			; SSE2-NEXT: movdqa %xmm4, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: pand %xmm6, %xmm2
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSE2-NEXT: movdqa %xmm4, %xmm0			; SSE2-NEXT: por %xmm2, %xmm4
	; SSE2-NEXT: movdqa %xmm2, %xmm1			; SSE2-NEXT: pand %xmm4, %xmm1
				; SSE2-NEXT: pandn %xmm3, %xmm4
				; SSE2-NEXT: por %xmm4, %xmm1
				; SSE2-NEXT: psubq %xmm3, %xmm1
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v4i64:			; SSSE3-LABEL: v4i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm2, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm5
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pxor %xmm4, %xmm5
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: movdqa %xmm0, %xmm6
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pxor %xmm4, %xmm6
	; SSSE3-NEXT: movq %rcx, %xmm4			; SSSE3-NEXT: movdqa %xmm6, %xmm7
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
	; SSSE3-NEXT: movq %xmm2, %rax			; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pand %xmm8, %xmm5
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: por %xmm5, %xmm6
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]			; SSSE3-NEXT: pand %xmm6, %xmm0
	; SSSE3-NEXT: movq %xmm3, %rax			; SSSE3-NEXT: pandn %xmm2, %xmm6
	; SSSE3-NEXT: movq %xmm1, %rcx			; SSSE3-NEXT: por %xmm6, %xmm0
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: psubq %xmm2, %xmm0
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: movdqa %xmm3, %xmm2
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pxor %xmm4, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]			; SSSE3-NEXT: pxor %xmm1, %xmm4
	; SSSE3-NEXT: movq %xmm0, %rax			; SSSE3-NEXT: movdqa %xmm4, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: pand %xmm6, %xmm2
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSSE3-NEXT: movdqa %xmm4, %xmm0			; SSSE3-NEXT: por %xmm2, %xmm4
	; SSSE3-NEXT: movdqa %xmm2, %xmm1			; SSSE3-NEXT: pand %xmm4, %xmm1
				; SSSE3-NEXT: pandn %xmm3, %xmm4
				; SSSE3-NEXT: por %xmm4, %xmm1
				; SSSE3-NEXT: psubq %xmm3, %xmm1
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v4i64:			; SSE41-LABEL: v4i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm2, %rax			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: movdqa %xmm2, %xmm5
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm6, %xmm5
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: movdqa %xmm0, %xmm7
	; SSE41-NEXT: movq %rcx, %xmm4			; SSE41-NEXT: pxor %xmm6, %xmm7
	; SSE41-NEXT: movq %xmm2, %rax			; SSE41-NEXT: movdqa %xmm7, %xmm0
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]			; SSE41-NEXT: pand %xmm8, %xmm5
	; SSE41-NEXT: pextrq $1, %xmm3, %rax			; SSE41-NEXT: por %xmm5, %xmm0
	; SSE41-NEXT: pextrq $1, %xmm1, %rcx			; SSE41-NEXT: movdqa %xmm2, %xmm5
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: psubq %xmm2, %xmm5
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: movq %xmm3, %rax			; SSE41-NEXT: pxor %xmm6, %xmm0
	; SSE41-NEXT: movq %xmm1, %rcx			; SSE41-NEXT: pxor %xmm1, %xmm6
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: movdqa %xmm6, %xmm2
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
	; SSE41-NEXT: movq %rcx, %xmm1			; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]			; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
				; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
				; SSE41-NEXT: pand %xmm4, %xmm0
				; SSE41-NEXT: por %xmm2, %xmm0
				; SSE41-NEXT: movdqa %xmm3, %xmm2
				; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
				; SSE41-NEXT: psubq %xmm3, %xmm2
				; SSE41-NEXT: movdqa %xmm5, %xmm0
				; SSE41-NEXT: movdqa %xmm2, %xmm1
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v4i64:			; AVX1-LABEL: v4i64:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vpextrq $1, %xmm2, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
				; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
				; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm5
				; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
				; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
				; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
				; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm3
				; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
				; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
	; AVX1-NEXT: vpextrq $1, %xmm3, %rcx			; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
	; AVX1-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm4
	; AVX1-NEXT: vmovq %xmm2, %rax
	; AVX1-NEXT: vmovq %xmm3, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm2
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX1-NEXT: vpextrq $1, %xmm1, %rax
	; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm3
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: vmovq %xmm0, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm0
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v4i64:			; AVX2-LABEL: v4i64:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	; AVX2-NEXT: vpextrq $1, %xmm2, %rax			; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3			; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
	; AVX2-NEXT: vpextrq $1, %xmm3, %rcx			; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
	; AVX2-NEXT: xorl %edx, %edx			; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: subq %rax, %rcx			; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm4
	; AVX2-NEXT: vmovq %xmm2, %rax
	; AVX2-NEXT: vmovq %xmm3, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm2
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX2-NEXT: vpextrq $1, %xmm1, %rax
	; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm3
	; AVX2-NEXT: vmovq %xmm1, %rax
	; AVX2-NEXT: vmovq %xmm0, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm0
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v4i64:			; AVX512-LABEL: v4i64:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vpextrq $1, %xmm2, %rax			; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: xorl %edx, %edx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm4
	; AVX512-NEXT: vmovq %xmm2, %rax
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm2
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX512-NEXT: vpextrq $1, %xmm1, %rax
	; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm3
	; AVX512-NEXT: vmovq %xmm1, %rax
	; AVX512-NEXT: vmovq %xmm0, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm0
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)			%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
	ret <4 x i64> %z			ret <4 x i64> %z
	}			}

	define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {			define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
	; SSE2-LABEL: v8i64:			; SSE2-LABEL: v8i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movdqa %xmm1, %xmm8			; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movdqa %xmm0, %xmm1			; SSE2-NEXT: movdqa %xmm4, %xmm9
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pxor %xmm8, %xmm9
	; SSE2-NEXT: movq %xmm0, %rdx			; SSE2-NEXT: movdqa %xmm0, %xmm10
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: pxor %xmm8, %xmm10
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: movdqa %xmm10, %xmm11
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
	; SSE2-NEXT: movq %rdx, %xmm0			; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]			; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pand %xmm12, %xmm9
	; SSE2-NEXT: movq %xmm1, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm9, %xmm10
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm10, %xmm0
	; SSE2-NEXT: movq %rdx, %xmm1			; SSE2-NEXT: pandn %xmm4, %xmm10
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; SSE2-NEXT: por %xmm10, %xmm0
	; SSE2-NEXT: movq %xmm5, %rcx			; SSE2-NEXT: psubq %xmm4, %xmm0
	; SSE2-NEXT: movq %xmm8, %rdx			; SSE2-NEXT: movdqa %xmm5, %xmm9
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pxor %xmm8, %xmm9
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: movdqa %xmm1, %xmm4
	; SSE2-NEXT: movq %rdx, %xmm1			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]			; SSE2-NEXT: movdqa %xmm4, %xmm10
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
	; SSE2-NEXT: movq %xmm4, %rdx			; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm11, %xmm9
	; SSE2-NEXT: movq %rdx, %xmm4			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSE2-NEXT: por %xmm9, %xmm4
	; SSE2-NEXT: movq %xmm6, %rcx			; SSE2-NEXT: pand %xmm4, %xmm1
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: pandn %xmm5, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm4, %xmm1
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: psubq %xmm5, %xmm1
	; SSE2-NEXT: movq %rdx, %xmm4			; SSE2-NEXT: movdqa %xmm6, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: movq %xmm5, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSE2-NEXT: pxor %xmm8, %xmm5
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: movdqa %xmm5, %xmm9
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
	; SSE2-NEXT: movq %rdx, %xmm2			; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSE2-NEXT: movq %xmm7, %rcx			; SSE2-NEXT: pand %xmm10, %xmm4
	; SSE2-NEXT: movq %xmm3, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm4, %xmm5
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: movq %rdx, %xmm5			; SSE2-NEXT: pandn %xmm6, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]			; SSE2-NEXT: por %xmm5, %xmm2
	; SSE2-NEXT: movq %xmm2, %rcx			; SSE2-NEXT: psubq %xmm6, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]			; SSE2-NEXT: movdqa %xmm7, %xmm4
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pxor %xmm3, %xmm8
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: movdqa %xmm8, %xmm5
	; SSE2-NEXT: movq %rdx, %xmm2			; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSE2-NEXT: movdqa %xmm4, %xmm2			; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
	; SSE2-NEXT: movdqa %xmm5, %xmm3			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
				; SSE2-NEXT: pand %xmm6, %xmm4
				; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
				; SSE2-NEXT: por %xmm4, %xmm5
				; SSE2-NEXT: pand %xmm5, %xmm3
				; SSE2-NEXT: pandn %xmm7, %xmm5
				; SSE2-NEXT: por %xmm5, %xmm3
				; SSE2-NEXT: psubq %xmm7, %xmm3
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v8i64:			; SSSE3-LABEL: v8i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movdqa %xmm1, %xmm8			; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movdqa %xmm0, %xmm1			; SSSE3-NEXT: movdqa %xmm4, %xmm9
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pxor %xmm8, %xmm9
	; SSSE3-NEXT: movq %xmm0, %rdx			; SSSE3-NEXT: movdqa %xmm0, %xmm10
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: pxor %xmm8, %xmm10
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: movdqa %xmm10, %xmm11
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
	; SSSE3-NEXT: movq %rdx, %xmm0			; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]			; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pand %xmm12, %xmm9
	; SSSE3-NEXT: movq %xmm1, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm9, %xmm10
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm10, %xmm0
	; SSSE3-NEXT: movq %rdx, %xmm1			; SSSE3-NEXT: pandn %xmm4, %xmm10
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; SSSE3-NEXT: por %xmm10, %xmm0
	; SSSE3-NEXT: movq %xmm5, %rcx			; SSSE3-NEXT: psubq %xmm4, %xmm0
	; SSSE3-NEXT: movq %xmm8, %rdx			; SSSE3-NEXT: movdqa %xmm5, %xmm9
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pxor %xmm8, %xmm9
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: movdqa %xmm1, %xmm4
	; SSSE3-NEXT: movq %rdx, %xmm1			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]			; SSSE3-NEXT: movdqa %xmm4, %xmm10
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
	; SSSE3-NEXT: movq %xmm4, %rdx			; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm11, %xmm9
	; SSSE3-NEXT: movq %rdx, %xmm4			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSSE3-NEXT: por %xmm9, %xmm4
	; SSSE3-NEXT: movq %xmm6, %rcx			; SSSE3-NEXT: pand %xmm4, %xmm1
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: pandn %xmm5, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm4, %xmm1
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: psubq %xmm5, %xmm1
	; SSSE3-NEXT: movq %rdx, %xmm4			; SSSE3-NEXT: movdqa %xmm6, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: movq %xmm5, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSSE3-NEXT: pxor %xmm8, %xmm5
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: movdqa %xmm5, %xmm9
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
	; SSSE3-NEXT: movq %rdx, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSSE3-NEXT: movq %xmm7, %rcx			; SSSE3-NEXT: pand %xmm10, %xmm4
	; SSSE3-NEXT: movq %xmm3, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm4, %xmm5
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: movq %rdx, %xmm5			; SSSE3-NEXT: pandn %xmm6, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]			; SSSE3-NEXT: por %xmm5, %xmm2
	; SSSE3-NEXT: movq %xmm2, %rcx			; SSSE3-NEXT: psubq %xmm6, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]			; SSSE3-NEXT: movdqa %xmm7, %xmm4
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pxor %xmm3, %xmm8
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: movdqa %xmm8, %xmm5
	; SSSE3-NEXT: movq %rdx, %xmm2			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSSE3-NEXT: movdqa %xmm4, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
	; SSSE3-NEXT: movdqa %xmm5, %xmm3			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
				; SSSE3-NEXT: pand %xmm6, %xmm4
				; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
				; SSSE3-NEXT: por %xmm4, %xmm5
				; SSSE3-NEXT: pand %xmm5, %xmm3
				; SSSE3-NEXT: pandn %xmm7, %xmm5
				; SSSE3-NEXT: por %xmm5, %xmm3
				; SSSE3-NEXT: psubq %xmm7, %xmm3
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v8i64:			; SSE41-LABEL: v8i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm4, %rcx			; SSE41-NEXT: movdqa %xmm1, %xmm8
	; SSE41-NEXT: pextrq $1, %xmm0, %rdx			; SSE41-NEXT: movdqa %xmm0, %xmm11
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm9
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm9
	; SSE41-NEXT: movq %rdx, %xmm8			; SSE41-NEXT: movdqa %xmm0, %xmm1
	; SSE41-NEXT: movq %xmm4, %rcx			; SSE41-NEXT: pxor %xmm10, %xmm1
	; SSE41-NEXT: movq %xmm0, %rdx			; SSE41-NEXT: movdqa %xmm1, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
	; SSE41-NEXT: movq %rdx, %xmm0			; SSE41-NEXT: pcmpeqd %xmm9, %xmm1
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]			; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; SSE41-NEXT: pextrq $1, %xmm5, %rcx			; SSE41-NEXT: pand %xmm12, %xmm1
	; SSE41-NEXT: pextrq $1, %xmm1, %rdx			; SSE41-NEXT: por %xmm1, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm9
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: psubq %xmm4, %xmm9
	; SSE41-NEXT: movq %xmm5, %rcx			; SSE41-NEXT: movdqa %xmm5, %xmm0
	; SSE41-NEXT: movq %xmm1, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm8, %xmm1
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm1
	; SSE41-NEXT: movq %rdx, %xmm1			; SSE41-NEXT: movdqa %xmm1, %xmm4
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm6, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
	; SSE41-NEXT: pextrq $1, %xmm2, %rdx			; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pand %xmm11, %xmm0
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: por %xmm4, %xmm0
	; SSE41-NEXT: movq %xmm6, %rcx			; SSE41-NEXT: movdqa %xmm5, %xmm1
	; SSE41-NEXT: movq %xmm2, %rdx			; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: psubq %xmm5, %xmm1
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: movdqa %xmm6, %xmm0
	; SSE41-NEXT: movq %rdx, %xmm2			; SSE41-NEXT: pxor %xmm10, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]			; SSE41-NEXT: movdqa %xmm2, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm7, %rcx			; SSE41-NEXT: pxor %xmm10, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm3, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm5
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
	; SSE41-NEXT: movq %xmm7, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
	; SSE41-NEXT: movq %xmm3, %rdx			; SSE41-NEXT: pand %xmm8, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: por %xmm5, %xmm0
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: movdqa %xmm6, %xmm4
	; SSE41-NEXT: movq %rdx, %xmm3			; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]			; SSE41-NEXT: psubq %xmm6, %xmm4
				; SSE41-NEXT: movdqa %xmm7, %xmm0
				; SSE41-NEXT: pxor %xmm10, %xmm0
				; SSE41-NEXT: pxor %xmm3, %xmm10
				; SSE41-NEXT: movdqa %xmm10, %xmm2
				; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
				; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
				; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
				; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
				; SSE41-NEXT: pand %xmm5, %xmm0
				; SSE41-NEXT: por %xmm2, %xmm0
				; SSE41-NEXT: movdqa %xmm7, %xmm5
				; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
				; SSE41-NEXT: psubq %xmm7, %xmm5
				; SSE41-NEXT: movdqa %xmm9, %xmm0
				; SSE41-NEXT: movdqa %xmm4, %xmm2
				; SSE41-NEXT: movdqa %xmm5, %xmm3
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v8i64:			; AVX1-LABEL: v8i64:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
	; AVX1-NEXT: vpextrq $1, %xmm4, %rcx			; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5			; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
	; AVX1-NEXT: vpextrq $1, %xmm5, %rdx			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8
	; AVX1-NEXT: cmovbq %rax, %rdx			; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7
	; AVX1-NEXT: vmovq %rdx, %xmm6			; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6
	; AVX1-NEXT: vmovq %xmm4, %rcx			; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
	; AVX1-NEXT: vmovq %xmm5, %rdx			; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vblendvpd %ymm6, %ymm0, %ymm2, %ymm0
	; AVX1-NEXT: cmovbq %rax, %rdx			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
	; AVX1-NEXT: vmovq %rdx, %xmm4			; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]			; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm5
	; AVX1-NEXT: vmovq %xmm2, %rcx
	; AVX1-NEXT: vmovq %xmm0, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm0
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
	; AVX1-NEXT: vpextrq $1, %xmm2, %rcx			; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
				; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
				; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
				; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6
				; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm5
				; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5
				; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
				; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
	; AVX1-NEXT: vpextrq $1, %xmm4, %rdx			; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm5
	; AVX1-NEXT: vmovq %xmm2, %rcx
	; AVX1-NEXT: vmovq %xmm4, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm2
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
	; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm4
	; AVX1-NEXT: vmovq %xmm3, %rcx
	; AVX1-NEXT: vmovq %xmm1, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm1
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v8i64:			; AVX2-LABEL: v8i64:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4			; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	; AVX2-NEXT: vpextrq $1, %xmm4, %rcx			; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5			; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
	; AVX2-NEXT: vpextrq $1, %xmm5, %rdx			; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5
	; AVX2-NEXT: xorl %eax, %eax			; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
	; AVX2-NEXT: subq %rcx, %rdx			; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: cmovbq %rax, %rdx			; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2
	; AVX2-NEXT: vmovq %rdx, %xmm6			; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm4
	; AVX2-NEXT: vmovq %xmm4, %rcx			; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2
	; AVX2-NEXT: vmovq %xmm5, %rdx			; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
	; AVX2-NEXT: subq %rcx, %rdx			; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm4
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
	; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm5
	; AVX2-NEXT: vmovq %xmm2, %rcx
	; AVX2-NEXT: vmovq %xmm0, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm0
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
	; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
	; AVX2-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm5
	; AVX2-NEXT: vmovq %xmm2, %rcx
	; AVX2-NEXT: vmovq %xmm4, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm2
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
	; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm4
	; AVX2-NEXT: vmovq %xmm3, %rcx
	; AVX2-NEXT: vmovq %xmm1, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm1
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v8i64:			; AVX512-LABEL: v8i64:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2			; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vpextrq $1, %xmm2, %rcx			; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm4
	; AVX512-NEXT: vmovq %xmm2, %rcx
	; AVX512-NEXT: vmovq %xmm3, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm2
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4
	; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm5
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: vmovq %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm3
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
	; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm5
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: vmovq %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm3
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
	; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm4
	; AVX512-NEXT: vmovq %xmm1, %rcx
	; AVX512-NEXT: vmovq %xmm0, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm0
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
	; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
	; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)			%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
	ret <8 x i64> %z			ret <8 x i64> %z
	}			}

	define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {			define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
	; SSE-LABEL: v2i128:			; SSE-LABEL: v2i128:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	Show All 36 Lines

test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

Show All 20 Lines
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

declare i64 @llvm.usub.sat.i64(i64, i64)		declare i64 @llvm.usub.sat.i64(i64, i64)
declare i32 @llvm.usub.sat.i32(i32, i32)		declare i32 @llvm.usub.sat.i32(i32, i32)
declare i16 @llvm.usub.sat.i16(i16, i16)		declare i16 @llvm.usub.sat.i16(i16, i16)
declare i8 @llvm.usub.sat.i8 (i8 , i8 )		declare i8 @llvm.usub.sat.i8 (i8 , i8 )

define void @sub_v8i64() {		define void @sub_v8i64() {
; CHECK-LABEL: @sub_v8i64(		; SSE-LABEL: @sub_v8i64(
; CHECK-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		; SSE-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
; CHECK-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		; SSE-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
; CHECK-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		; SSE-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
; CHECK-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		; SSE-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
; CHECK-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		; SSE-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
; CHECK-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		; SSE-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
; CHECK-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		; SSE-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
; CHECK-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8		; SSE-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
; CHECK-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8		; SSE-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
; CHECK-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8		; SSE-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
; CHECK-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8		; SSE-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
; CHECK-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8		; SSE-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
; CHECK-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8		; SSE-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
; CHECK-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8		; SSE-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
; CHECK-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8		; SSE-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
; CHECK-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8		; SSE-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
; CHECK-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])		; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
; CHECK-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])		; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
; CHECK-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])		; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
; CHECK-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])		; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
; CHECK-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])		; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
; CHECK-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])		; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
; CHECK-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])		; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
; CHECK-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])		; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8		; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8		; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8		; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; SLM-LABEL: @sub_v8i64(
		; SLM-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; SLM-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; SLM-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; SLM-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; SLM-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; SLM-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; SLM-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; SLM-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; SLM-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; SLM-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; SLM-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; SLM-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; SLM-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; SLM-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; SLM-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; SLM-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; SLM-NEXT: ret void
		;
		; AVX1-LABEL: @sub_v8i64(
		; AVX1-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; AVX1-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; AVX1-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; AVX1-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; AVX1-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; AVX1-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; AVX1-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; AVX1-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; AVX1-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; AVX1-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; AVX1-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; AVX1-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; AVX1-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; AVX1-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; AVX1-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; AVX1-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; AVX1-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; AVX1-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; AVX1-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; AVX1-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; AVX1-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; AVX1-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; AVX1-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; AVX1-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; AVX1-NEXT: ret void
		;
		; AVX2-LABEL: @sub_v8i64(
		; AVX2-NEXT: [[A0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; AVX2-NEXT: [[A1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; AVX2-NEXT: [[A2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; AVX2-NEXT: [[A3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; AVX2-NEXT: [[A4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; AVX2-NEXT: [[A5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; AVX2-NEXT: [[A6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; AVX2-NEXT: [[A7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; AVX2-NEXT: [[B0:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; AVX2-NEXT: [[B1:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; AVX2-NEXT: [[B2:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; AVX2-NEXT: [[B3:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; AVX2-NEXT: [[B4:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; AVX2-NEXT: [[B5:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; AVX2-NEXT: [[B6:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; AVX2-NEXT: [[B7:%.]] = load i64, i64 getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; AVX2-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; AVX2-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; AVX2-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; AVX2-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; AVX2-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; AVX2-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; AVX2-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; AVX2-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; AVX2-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; AVX2-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; AVX2-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; AVX2-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; AVX2-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; AVX2-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; AVX2-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; AVX2-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; AVX2-NEXT: ret void
		;
		; AVX512-LABEL: @sub_v8i64(
		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
		; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
		; AVX512-NEXT: ret void
		;
		; AVX256BW-LABEL: @sub_v8i64(
		; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
		; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
		; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
Show All 21 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @sub_v16i32() {		define void @sub_v16i32() {
; CHECK-LABEL: @sub_v16i32(		; SSE-LABEL: @sub_v16i32(
; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4		; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4		; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4		; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4		; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4		; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4		; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4		; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4		; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4		; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4		; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4		; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4		; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4		; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4		; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4		; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4		; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4		; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4		; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4		; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4		; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4		; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4		; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4		; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4		; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4		; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4		; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4		; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4		; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4		; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4		; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4		; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4		; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A0]], i32 [[B0]])		; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A0]], i32 [[B0]])
; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A1]], i32 [[B1]])		; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A1]], i32 [[B1]])
; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A2]], i32 [[B2]])		; SSE-NEXT: [[R2:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A2]], i32 [[B2]])
; CHECK-NEXT: [[R3:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A3]], i32 [[B3]])		; SSE-NEXT: [[R3:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A3]], i32 [[B3]])
; CHECK-NEXT: [[R4:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A4]], i32 [[B4]])		; SSE-NEXT: [[R4:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A4]], i32 [[B4]])
; CHECK-NEXT: [[R5:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A5]], i32 [[B5]])		; SSE-NEXT: [[R5:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A5]], i32 [[B5]])
; CHECK-NEXT: [[R6:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A6]], i32 [[B6]])		; SSE-NEXT: [[R6:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A6]], i32 [[B6]])
; CHECK-NEXT: [[R7:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A7]], i32 [[B7]])		; SSE-NEXT: [[R7:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A7]], i32 [[B7]])
; CHECK-NEXT: [[R8:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A8]], i32 [[B8]])		; SSE-NEXT: [[R8:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A8]], i32 [[B8]])
; CHECK-NEXT: [[R9:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A9]], i32 [[B9]])		; SSE-NEXT: [[R9:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A9]], i32 [[B9]])
; CHECK-NEXT: [[R10:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A10]], i32 [[B10]])		; SSE-NEXT: [[R10:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A10]], i32 [[B10]])
; CHECK-NEXT: [[R11:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A11]], i32 [[B11]])		; SSE-NEXT: [[R11:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A11]], i32 [[B11]])
; CHECK-NEXT: [[R12:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A12]], i32 [[B12]])		; SSE-NEXT: [[R12:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A12]], i32 [[B12]])
; CHECK-NEXT: [[R13:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A13]], i32 [[B13]])		; SSE-NEXT: [[R13:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A13]], i32 [[B13]])
; CHECK-NEXT: [[R14:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A14]], i32 [[B14]])		; SSE-NEXT: [[R14:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A14]], i32 [[B14]])
; CHECK-NEXT: [[R15:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A15]], i32 [[B15]])		; SSE-NEXT: [[R15:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A15]], i32 [[B15]])
; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4		; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4		; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4		; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4		; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4		; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4		; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4		; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4		; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4		; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4		; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4		; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4		; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; SLM-LABEL: @sub_v16i32(
		; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
		; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
		; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
		; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
		; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: ret void
		;
		; AVX-LABEL: @sub_v16i32(
		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
		; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
		; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
		; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: ret void
		;
		; AVX512-LABEL: @sub_v16i32(
		; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
		; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
		; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
		; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 530 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 181794

lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

lib/CodeGen/SelectionDAG/TargetLowering.cpp

lib/Target/X86/X86TargetTransformInfo.cpp

test/Analysis/CostModel/X86/arith-usat.ll

test/CodeGen/X86/usub_sat.ll

test/CodeGen/X86/usub_sat_vec.ll

test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 181794

lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

lib/CodeGen/SelectionDAG/TargetLowering.cpp

lib/Target/X86/X86TargetTransformInfo.cpp

test/Analysis/CostModel/X86/arith-usat.ll

test/CodeGen/X86/usub_sat.ll

test/CodeGen/X86/usub_sat_vec.ll

test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB
ClosedPublic