Diff 181820

llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Show First 20 Lines • Show All 135 Lines • ▼ Show 20 Lines	class VectorLegalizer {
SDValue ExpandFSUB(SDValue Op);		SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);		SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTPOP(SDValue Op);		SDValue ExpandCTPOP(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);		SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ(SDValue Op);		SDValue ExpandCTTZ(SDValue Op);
SDValue ExpandFunnelShift(SDValue Op);		SDValue ExpandFunnelShift(SDValue Op);
SDValue ExpandROT(SDValue Op);		SDValue ExpandROT(SDValue Op);
SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);		SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
		SDValue ExpandAddSubSat(SDValue Op);
SDValue ExpandStrictFPOp(SDValue Op);		SDValue ExpandStrictFPOp(SDValue Op);

/// Implements vector promotion.		/// Implements vector promotion.
///		///
/// This is essentially just bitcasting the operands to a different type and		/// This is essentially just bitcasting the operands to a different type and
/// bitcasting the result back to the original type.		/// bitcasting the result back to the original type.
SDValue Promote(SDValue Op);		SDValue Promote(SDValue Op);

▲ Show 20 Lines • Show All 620 Lines • ▼ Show 20 Lines	SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::FSHR:		case ISD::FSHR:
return ExpandFunnelShift(Op);		return ExpandFunnelShift(Op);
case ISD::ROTL:		case ISD::ROTL:
case ISD::ROTR:		case ISD::ROTR:
return ExpandROT(Op);		return ExpandROT(Op);
case ISD::FMINNUM:		case ISD::FMINNUM:
case ISD::FMAXNUM:		case ISD::FMAXNUM:
return ExpandFMINNUM_FMAXNUM(Op);		return ExpandFMINNUM_FMAXNUM(Op);
		case ISD::USUBSAT:
		case ISD::SSUBSAT:
		case ISD::UADDSAT:
		case ISD::SADDSAT:
		return ExpandAddSubSat(Op);
case ISD::STRICT_FADD:		case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:		case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:		case ISD::STRICT_FMUL:
case ISD::STRICT_FDIV:		case ISD::STRICT_FDIV:
case ISD::STRICT_FREM:		case ISD::STRICT_FREM:
case ISD::STRICT_FSQRT:		case ISD::STRICT_FSQRT:
case ISD::STRICT_FMA:		case ISD::STRICT_FMA:
case ISD::STRICT_FPOW:		case ISD::STRICT_FPOW:
▲ Show 20 Lines • Show All 413 Lines • ▼ Show 20 Lines
}		}

SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {		SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))		if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
return Expanded;		return Expanded;
return DAG.UnrollVectorOp(Op.getNode());		return DAG.UnrollVectorOp(Op.getNode());
}		}

		SDValue VectorLegalizer::ExpandAddSubSat(SDValue Op) {
		if (SDValue Expanded = TLI.expandAddSubSat(Op.getNode(), DAG))
		return Expanded;
		return DAG.UnrollVectorOp(Op.getNode());
		}

SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {		SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
EVT EltVT = VT.getVectorElementType();		EVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();		unsigned NumElems = VT.getVectorNumElements();
unsigned NumOpers = Op.getNumOperands();		unsigned NumOpers = Op.getNumOperands();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ValueVTs[] = {EltVT, MVT::Other};		EVT ValueVTs[] = {EltVT, MVT::Other};
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);
▲ Show 20 Lines • Show All 69 Lines • Show Last 20 Lines

llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Show First 20 Lines • Show All 5,271 Lines • ▼ Show 20 Lines	if (C->isNullValue() && CC == ISD::SETEQ) {
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);		return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
}		}
}		}
return SDValue();		return SDValue();
}		}

SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {		SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
unsigned Opcode = Node->getOpcode();		unsigned Opcode = Node->getOpcode();
		SDValue LHS = Node->getOperand(0);
		SDValue RHS = Node->getOperand(1);
		EVT VT = LHS.getValueType();
		SDLoc dl(Node);

		// usub.sat(a, b) -> umax(a, b) - b
		if (Opcode == ISD::USUBSAT && isOperationLegalOrCustom(ISD::UMAX, VT)) {
		SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
		return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
		}

		if (VT.isVector()) {
		// TODO: Consider not scalarizing here.
		return SDValue();
		}

unsigned OverflowOp;		unsigned OverflowOp;
switch (Opcode) {		switch (Opcode) {
case ISD::SADDSAT:		case ISD::SADDSAT:
OverflowOp = ISD::SADDO;		OverflowOp = ISD::SADDO;
break;		break;
case ISD::UADDSAT:		case ISD::UADDSAT:
OverflowOp = ISD::UADDO;		OverflowOp = ISD::UADDO;
break;		break;
case ISD::SSUBSAT:		case ISD::SSUBSAT:
OverflowOp = ISD::SSUBO;		OverflowOp = ISD::SSUBO;
break;		break;
case ISD::USUBSAT:		case ISD::USUBSAT:
OverflowOp = ISD::USUBO;		OverflowOp = ISD::USUBO;
break;		break;
default:		default:
llvm_unreachable("Expected method to receive signed or unsigned saturation "		llvm_unreachable("Expected method to receive signed or unsigned saturation "
"addition or subtraction node.");		"addition or subtraction node.");
}		}
assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");

SDLoc dl(Node);
SDValue LHS = Node->getOperand(0);
SDValue RHS = Node->getOperand(1);
assert(LHS.getValueType().isScalarInteger() &&		assert(LHS.getValueType().isScalarInteger() &&
"Expected operands to be integers. Vector of int arguments should "		"Expected operands to be integers. Vector of int arguments should "
"already be unrolled.");		"already be unrolled.");
assert(RHS.getValueType().isScalarInteger() &&		assert(RHS.getValueType().isScalarInteger() &&
"Expected operands to be integers. Vector of int arguments should "		"Expected operands to be integers. Vector of int arguments should "
"already be unrolled.");		"already be unrolled.");
assert(LHS.getValueType() == RHS.getValueType() &&		assert(LHS.getValueType() == RHS.getValueType() &&
"Expected both operands to be the same type");		"Expected both operands to be the same type");
▲ Show 20 Lines • Show All 85 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,774 Lines • ▼ Show 20 Lines	static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },		{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },		{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::CTLZ, MVT::v8i64, 29 },		{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },		{ ISD::CTLZ, MVT::v16i32, 35 },
{ ISD::CTPOP, MVT::v8i64, 16 },		{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },		{ ISD::CTPOP, MVT::v16i32, 24 },
{ ISD::CTTZ, MVT::v8i64, 20 },		{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },		{ ISD::CTTZ, MVT::v16i32, 28 },
		{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
		{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
		{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
		{ ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq
};		};
static const CostTblEntry XOPCostTbl[] = {		static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },		{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },		{ ISD::BITREVERSE, MVT::v8i32, 4 },
{ ISD::BITREVERSE, MVT::v16i16, 4 },		{ ISD::BITREVERSE, MVT::v16i16, 4 },
{ ISD::BITREVERSE, MVT::v32i8, 4 },		{ ISD::BITREVERSE, MVT::v32i8, 4 },
{ ISD::BITREVERSE, MVT::v2i64, 1 },		{ ISD::BITREVERSE, MVT::v2i64, 1 },
{ ISD::BITREVERSE, MVT::v4i32, 1 },		{ ISD::BITREVERSE, MVT::v4i32, 1 },
Show All 27 Lines	static const CostTblEntry AVX2CostTbl[] = {
{ ISD::SADDSAT, MVT::v16i16, 1 },		{ ISD::SADDSAT, MVT::v16i16, 1 },
{ ISD::SADDSAT, MVT::v32i8, 1 },		{ ISD::SADDSAT, MVT::v32i8, 1 },
{ ISD::SSUBSAT, MVT::v16i16, 1 },		{ ISD::SSUBSAT, MVT::v16i16, 1 },
{ ISD::SSUBSAT, MVT::v32i8, 1 },		{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },		{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },		{ ISD::UADDSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v16i16, 1 },		{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },		{ ISD::USUBSAT, MVT::v32i8, 1 },
		{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};		};
static const CostTblEntry AVX1CostTbl[] = {		static const CostTblEntry AVX1CostTbl[] = {
Show All 19 Lines	static const CostTblEntry AVX1CostTbl[] = {
{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert		{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
		{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};		};
static const CostTblEntry GLMCostTbl[] = {		static const CostTblEntry GLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, 19 }, // sqrtss		{ ISD::FSQRT, MVT::f32, 19 }, // sqrtss
{ ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps		{ ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
{ ISD::FSQRT, MVT::f64, 34 }, // sqrtsd		{ ISD::FSQRT, MVT::f64, 34 }, // sqrtsd
{ ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd		{ ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
};		};
static const CostTblEntry SLMCostTbl[] = {		static const CostTblEntry SLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, 20 }, // sqrtss		{ ISD::FSQRT, MVT::f32, 20 }, // sqrtss
{ ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps		{ ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
{ ISD::FSQRT, MVT::f64, 35 }, // sqrtsd		{ ISD::FSQRT, MVT::f64, 35 }, // sqrtsd
{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd		{ ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
};		};
static const CostTblEntry SSE42CostTbl[] = {		static const CostTblEntry SSE42CostTbl[] = {
		{ ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/		{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/		{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};		};
static const CostTblEntry SSSE3CostTbl[] = {		static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },		{ ISD::BITREVERSE, MVT::v2i64, 5 },
{ ISD::BITREVERSE, MVT::v4i32, 5 },		{ ISD::BITREVERSE, MVT::v4i32, 5 },
{ ISD::BITREVERSE, MVT::v8i16, 5 },		{ ISD::BITREVERSE, MVT::v8i16, 5 },
{ ISD::BITREVERSE, MVT::v16i8, 5 },		{ ISD::BITREVERSE, MVT::v16i8, 5 },
▲ Show 20 Lines • Show All 1,374 Lines • Show Last 20 Lines

llvm/trunk/test/Analysis/CostModel/X86/arith-usat.ll

	Show First 20 Lines • Show All 244 Lines • ▼ Show 20 Lines
	declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)			declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)

	declare i8 @llvm.usub.sat.i8(i8, i8)			declare i8 @llvm.usub.sat.i8(i8, i8)
	declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)			declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>)
	declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)			declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
	declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)			declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)

	define i32 @sub(i32 %arg) {			define i32 @sub(i32 %arg) {
	; SSE-LABEL: 'sub'			; SSSE3-LABEL: 'sub'
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
				;
				; SSE42-LABEL: 'sub'
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
				; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX1-LABEL: 'sub'			; AVX1-LABEL: 'sub'
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX2-LABEL: 'sub'			; AVX2-LABEL: 'sub'
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512F-LABEL: 'sub'			; AVX512F-LABEL: 'sub'
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512BW-LABEL: 'sub'			; AVX512BW-LABEL: 'sub'
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; AVX512DQ-LABEL: 'sub'			; AVX512DQ-LABEL: 'sub'
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; SLM-LABEL: 'sub'			; SLM-LABEL: 'sub'
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; GLM-LABEL: 'sub'			; GLM-LABEL: 'sub'
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef			; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
	;			;
	; BTVER2-LABEL: 'sub'			; BTVER2-LABEL: 'sub'
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
	; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)			; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
	Show All 24 Lines

llvm/trunk/test/CodeGen/X86/usub_sat.ll

	Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines
	; X86-NEXT: movl %edi, (%eax)			; X86-NEXT: movl %edi, (%eax)
	; X86-NEXT: popl %esi			; X86-NEXT: popl %esi
	; X86-NEXT: popl %edi			; X86-NEXT: popl %edi
	; X86-NEXT: popl %ebx			; X86-NEXT: popl %ebx
	; X86-NEXT: retl $4			; X86-NEXT: retl $4
	;			;
	; X64-LABEL: vec:			; X64-LABEL: vec:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; X64-NEXT: movd %xmm2, %eax			; X64-NEXT: movdqa %xmm1, %xmm3
	; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; X64-NEXT: pxor %xmm2, %xmm3
	; X64-NEXT: movd %xmm2, %ecx			; X64-NEXT: pxor %xmm0, %xmm2
	; X64-NEXT: xorl %edx, %edx			; X64-NEXT: pcmpgtd %xmm3, %xmm2
	; X64-NEXT: subl %eax, %ecx			; X64-NEXT: pand %xmm2, %xmm0
	; X64-NEXT: cmovbl %edx, %ecx			; X64-NEXT: pandn %xmm1, %xmm2
	; X64-NEXT: movd %ecx, %xmm2			; X64-NEXT: por %xmm2, %xmm0
	; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; X64-NEXT: psubd %xmm1, %xmm0
	; X64-NEXT: movd %xmm3, %eax
	; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; X64-NEXT: movd %xmm3, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm3
	; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; X64-NEXT: movd %xmm1, %eax
	; X64-NEXT: movd %xmm0, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm2
	; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; X64-NEXT: movd %xmm1, %eax
	; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; X64-NEXT: movd %xmm0, %ecx
	; X64-NEXT: subl %eax, %ecx
	; X64-NEXT: cmovbl %edx, %ecx
	; X64-NEXT: movd %ecx, %xmm0
	; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; X64-NEXT: movdqa %xmm2, %xmm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);			%tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
	ret <4 x i32> %tmp;			ret <4 x i32> %tmp;
	}			}

llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll

	Show First 20 Lines • Show All 628 Lines • ▼ Show 20 Lines
	}			}

	; Expanded			; Expanded

	define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {			define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
	; SSE2-LABEL: v2i32:			; SSE2-LABEL: v2i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: psllq $32, %xmm1			; SSE2-NEXT: psllq $32, %xmm1
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
				; SSE2-NEXT: movdqa %xmm1, %xmm3
				; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: psllq $32, %xmm0			; SSE2-NEXT: psllq $32, %xmm0
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: por %xmm2, %xmm3
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pand %xmm3, %xmm0
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pandn %xmm1, %xmm3
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: por %xmm3, %xmm0
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: psubq %xmm1, %xmm0
	; SSE2-NEXT: psrlq $32, %xmm2			; SSE2-NEXT: psrlq $32, %xmm0
	; SSE2-NEXT: movdqa %xmm2, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v2i32:			; SSSE3-LABEL: v2i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: psllq $32, %xmm1			; SSSE3-NEXT: psllq $32, %xmm1
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
				; SSSE3-NEXT: movdqa %xmm1, %xmm3
				; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: psllq $32, %xmm0			; SSSE3-NEXT: psllq $32, %xmm0
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: por %xmm2, %xmm3
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pand %xmm3, %xmm0
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pandn %xmm1, %xmm3
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: por %xmm3, %xmm0
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: psubq %xmm1, %xmm0
	; SSSE3-NEXT: psrlq $32, %xmm2			; SSSE3-NEXT: psrlq $32, %xmm0
	; SSSE3-NEXT: movdqa %xmm2, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v2i32:			; SSE41-LABEL: v2i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
				; SSE41-NEXT: movdqa %xmm0, %xmm2
	; SSE41-NEXT: psllq $32, %xmm1			; SSE41-NEXT: psllq $32, %xmm1
	; SSE41-NEXT: pextrq $1, %xmm1, %rax			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: psllq $32, %xmm0			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: pxor %xmm0, %xmm3
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: psllq $32, %xmm2
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm2, %xmm0
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE41-NEXT: movq %xmm1, %rax			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pand %xmm5, %xmm0
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: por %xmm4, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: psrlq $32, %xmm0			; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
				; SSE41-NEXT: psubq %xmm1, %xmm3
				; SSE41-NEXT: psrlq $32, %xmm3
				; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v2i32:			; AVX1-LABEL: v2i32:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: vpsllq $32, %xmm1, %xmm1			; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
	; AVX-NEXT: vpextrq $1, %xmm1, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpsllq $32, %xmm0, %xmm0			; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: vpextrq $1, %xmm0, %rcx			; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
	; AVX-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
	; AVX-NEXT: subq %rax, %rcx			; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
	; AVX-NEXT: vmovq %rcx, %xmm2			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm1, %rax			; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm0, %rcx			; AVX1-NEXT: retq
	; AVX-NEXT: subq %rax, %rcx			;
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX2-LABEL: v2i32:
	; AVX-NEXT: vmovq %rcx, %xmm0			; AVX2: # %bb.0:
	; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
	; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0			; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: retq			; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
				; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
				; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
				; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
				; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
				; AVX2-NEXT: retq
				;
				; AVX512-LABEL: v2i32:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
				; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
				; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsrlq $32, %xmm0, %xmm0
				; AVX512-NEXT: retq
	%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)			%z = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %x, <2 x i32> %y)
	ret <2 x i32> %z			ret <2 x i32> %z
	}			}

	define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {			define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
	; SSE2-LABEL: v4i32:			; SSE2-LABEL: v4i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; SSE2-NEXT: movd %xmm2, %eax			; SSE2-NEXT: movdqa %xmm1, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: movd %xmm2, %ecx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
	; SSE2-NEXT: subl %eax, %ecx			; SSE2-NEXT: pand %xmm2, %xmm0
	; SSE2-NEXT: cmovbl %edx, %ecx			; SSE2-NEXT: pandn %xmm1, %xmm2
	; SSE2-NEXT: movd %ecx, %xmm2			; SSE2-NEXT: por %xmm2, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; SSE2-NEXT: psubd %xmm1, %xmm0
	; SSE2-NEXT: movd %xmm3, %eax
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm3
	; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: subl %eax, %ecx
	; SSE2-NEXT: cmovbl %edx, %ecx
	; SSE2-NEXT: movd %ecx, %xmm0
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; SSE2-NEXT: movdqa %xmm2, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v4i32:			; SSSE3-LABEL: v4i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
	; SSSE3-NEXT: movd %xmm2, %eax			; SSSE3-NEXT: movdqa %xmm1, %xmm3
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]			; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: movd %xmm2, %ecx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
	; SSSE3-NEXT: subl %eax, %ecx			; SSSE3-NEXT: pand %xmm2, %xmm0
	; SSSE3-NEXT: cmovbl %edx, %ecx			; SSSE3-NEXT: pandn %xmm1, %xmm2
	; SSSE3-NEXT: movd %ecx, %xmm2			; SSSE3-NEXT: por %xmm2, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]			; SSSE3-NEXT: psubd %xmm1, %xmm0
	; SSSE3-NEXT: movd %xmm3, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm3
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm1, %eax
	; SSSE3-NEXT: movd %xmm0, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSSE3-NEXT: movd %xmm1, %eax
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
	; SSSE3-NEXT: movd %xmm0, %ecx
	; SSSE3-NEXT: subl %eax, %ecx
	; SSSE3-NEXT: cmovbl %edx, %ecx
	; SSSE3-NEXT: movd %ecx, %xmm0
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
	; SSSE3-NEXT: movdqa %xmm2, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v4i32:			; SSE41-LABEL: v4i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrd $1, %xmm1, %eax			; SSE41-NEXT: pmaxud %xmm1, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm0, %ecx			; SSE41-NEXT: psubd %xmm1, %xmm0
	; SSE41-NEXT: xorl %edx, %edx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: movd %xmm1, %eax
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %eax, %esi
	; SSE41-NEXT: cmovbl %edx, %esi
	; SSE41-NEXT: movd %esi, %xmm2
	; SSE41-NEXT: pinsrd $1, %ecx, %xmm2
	; SSE41-NEXT: pextrd $2, %xmm1, %eax
	; SSE41-NEXT: pextrd $2, %xmm0, %ecx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: pinsrd $2, %ecx, %xmm2
	; SSE41-NEXT: pextrd $3, %xmm1, %eax
	; SSE41-NEXT: pextrd $3, %xmm0, %ecx
	; SSE41-NEXT: subl %eax, %ecx
	; SSE41-NEXT: cmovbl %edx, %ecx
	; SSE41-NEXT: pinsrd $3, %ecx, %xmm2
	; SSE41-NEXT: movdqa %xmm2, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v4i32:			; AVX-LABEL: v4i32:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpextrd $1, %xmm1, %eax			; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpextrd $1, %xmm0, %ecx			; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX-NEXT: xorl %edx, %edx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vmovd %xmm1, %eax
	; AVX-NEXT: vmovd %xmm0, %esi
	; AVX-NEXT: subl %eax, %esi
	; AVX-NEXT: cmovbl %edx, %esi
	; AVX-NEXT: vmovd %esi, %xmm2
	; AVX-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
	; AVX-NEXT: vpextrd $2, %xmm1, %eax
	; AVX-NEXT: vpextrd $2, %xmm0, %ecx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
	; AVX-NEXT: vpextrd $3, %xmm1, %eax
	; AVX-NEXT: vpextrd $3, %xmm0, %ecx
	; AVX-NEXT: subl %eax, %ecx
	; AVX-NEXT: cmovbl %edx, %ecx
	; AVX-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)			%z = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
	ret <4 x i32> %z			ret <4 x i32> %z
	}			}

	define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {			define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
	; SSE2-LABEL: v8i32:			; SSE2-LABEL: v8i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
				; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
				; SSE2-NEXT: movdqa %xmm2, %xmm6
				; SSE2-NEXT: pxor %xmm5, %xmm6
	; SSE2-NEXT: movdqa %xmm0, %xmm4			; SSE2-NEXT: movdqa %xmm0, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]			; SSE2-NEXT: pxor %xmm5, %xmm4
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSE2-NEXT: pand %xmm4, %xmm0
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: pandn %xmm2, %xmm4
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: por %xmm0, %xmm4
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: psubd %xmm2, %xmm4
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: movdqa %xmm3, %xmm0
	; SSE2-NEXT: movd %edx, %xmm0			; SSE2-NEXT: pxor %xmm5, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]			; SSE2-NEXT: pxor %xmm1, %xmm5
	; SSE2-NEXT: movd %xmm5, %ecx			; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]			; SSE2-NEXT: pand %xmm5, %xmm1
	; SSE2-NEXT: movd %xmm5, %edx			; SSE2-NEXT: pandn %xmm3, %xmm5
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: por %xmm5, %xmm1
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: psubd %xmm3, %xmm1
	; SSE2-NEXT: movd %edx, %xmm5			; SSE2-NEXT: movdqa %xmm4, %xmm0
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: movd %xmm1, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
	; SSE2-NEXT: movd %xmm3, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSE2-NEXT: movd %xmm1, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; SSE2-NEXT: movdqa %xmm2, %xmm1
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v8i32:			; SSSE3-LABEL: v8i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
				; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
				; SSSE3-NEXT: movdqa %xmm2, %xmm6
				; SSSE3-NEXT: pxor %xmm5, %xmm6
	; SSSE3-NEXT: movdqa %xmm0, %xmm4			; SSSE3-NEXT: movdqa %xmm0, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]			; SSSE3-NEXT: pxor %xmm5, %xmm4
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSSE3-NEXT: pand %xmm4, %xmm0
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: pandn %xmm2, %xmm4
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: por %xmm0, %xmm4
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: psubd %xmm2, %xmm4
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: movdqa %xmm3, %xmm0
	; SSSE3-NEXT: movd %edx, %xmm0			; SSSE3-NEXT: pxor %xmm5, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]			; SSSE3-NEXT: pxor %xmm1, %xmm5
	; SSSE3-NEXT: movd %xmm5, %ecx			; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]			; SSSE3-NEXT: pand %xmm5, %xmm1
	; SSSE3-NEXT: movd %xmm5, %edx			; SSSE3-NEXT: pandn %xmm3, %xmm5
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: por %xmm5, %xmm1
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: psubd %xmm3, %xmm1
	; SSSE3-NEXT: movd %edx, %xmm5			; SSSE3-NEXT: movdqa %xmm4, %xmm0
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: movd %xmm1, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
	; SSSE3-NEXT: movd %xmm3, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
	; SSSE3-NEXT: movd %xmm1, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; SSSE3-NEXT: movdqa %xmm2, %xmm1
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v8i32:			; SSE41-LABEL: v8i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: movdqa %xmm0, %xmm4			; SSE41-NEXT: pmaxud %xmm2, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm2, %ecx			; SSE41-NEXT: psubd %xmm2, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm0, %edx			; SSE41-NEXT: pmaxud %xmm3, %xmm1
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: psubd %xmm3, %xmm1
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm2, %ecx
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pinsrd $1, %edx, %xmm0
	; SSE41-NEXT: pextrd $2, %xmm2, %ecx
	; SSE41-NEXT: pextrd $2, %xmm4, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm0
	; SSE41-NEXT: pextrd $3, %xmm2, %ecx
	; SSE41-NEXT: pextrd $3, %xmm4, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm3, %ecx
	; SSE41-NEXT: pextrd $1, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm3, %ecx
	; SSE41-NEXT: movd %xmm1, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm2
	; SSE41-NEXT: pinsrd $1, %edx, %xmm2
	; SSE41-NEXT: pextrd $2, %xmm3, %ecx
	; SSE41-NEXT: pextrd $2, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm2
	; SSE41-NEXT: pextrd $3, %xmm3, %ecx
	; SSE41-NEXT: pextrd $3, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm2
	; SSE41-NEXT: movdqa %xmm2, %xmm1
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v8i32:			; AVX1-LABEL: v8i32:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
	; AVX1-NEXT: vpextrd $1, %xmm3, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm2, %ecx
	; AVX1-NEXT: vmovd %xmm3, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm4
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm3, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm3, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm1, %ecx
	; AVX1-NEXT: vmovd %xmm0, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm3
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v8i32:			; AVX2-LABEL: v8i32:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx			; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX2-NEXT: vpextrd $1, %xmm3, %edx
	; AVX2-NEXT: xorl %eax, %eax
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm3, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm4
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm3, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm3, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm1, %ecx
	; AVX2-NEXT: vmovd %xmm0, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm3
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v8i32:			; AVX512-LABEL: v8i32:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vpextrd $1, %xmm2, %ecx			; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %edx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm2, %ecx
	; AVX512-NEXT: vmovd %xmm3, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $1, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm1, %ecx
	; AVX512-NEXT: vmovd %xmm0, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm3
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm3, %xmm3
	; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm3, %xmm3
	; AVX512-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm3, %xmm0
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)			%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
	ret <8 x i32> %z			ret <8 x i32> %z
	}			}

	define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {			define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
	; SSE2-LABEL: v16i32:			; SSE2-LABEL: v16i32:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movdqa %xmm1, %xmm8			; SSE2-NEXT: movdqa %xmm1, %xmm8
	; SSE2-NEXT: movdqa %xmm0, %xmm1			; SSE2-NEXT: movdqa %xmm0, %xmm10
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: movdqa %xmm4, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]			; SSE2-NEXT: pxor %xmm9, %xmm1
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: pxor %xmm9, %xmm0
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: pand %xmm0, %xmm10
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pandn %xmm4, %xmm0
	; SSE2-NEXT: movd %edx, %xmm9			; SSE2-NEXT: por %xmm10, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]			; SSE2-NEXT: psubd %xmm4, %xmm0
	; SSE2-NEXT: movd %xmm0, %ecx			; SSE2-NEXT: movdqa %xmm5, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSE2-NEXT: pxor %xmm9, %xmm4
	; SSE2-NEXT: movd %xmm0, %edx			; SSE2-NEXT: movdqa %xmm8, %xmm1
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: pxor %xmm9, %xmm1
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
	; SSE2-NEXT: movd %edx, %xmm10			; SSE2-NEXT: pand %xmm1, %xmm8
	; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]			; SSE2-NEXT: pandn %xmm5, %xmm1
	; SSE2-NEXT: movd %xmm4, %ecx			; SSE2-NEXT: por %xmm8, %xmm1
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: psubd %xmm5, %xmm1
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: movdqa %xmm6, %xmm5
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: pxor %xmm9, %xmm5
	; SSE2-NEXT: movd %edx, %xmm0			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]			; SSE2-NEXT: pxor %xmm9, %xmm4
	; SSE2-NEXT: movd %xmm4, %ecx			; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]			; SSE2-NEXT: pand %xmm4, %xmm2
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: pandn %xmm6, %xmm4
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: por %xmm2, %xmm4
	; SSE2-NEXT: cmovbl %eax, %edx			; SSE2-NEXT: psubd %xmm6, %xmm4
	; SSE2-NEXT: movd %edx, %xmm1			; SSE2-NEXT: movdqa %xmm7, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSE2-NEXT: pxor %xmm9, %xmm2
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]			; SSE2-NEXT: pxor %xmm3, %xmm9
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]			; SSE2-NEXT: pcmpgtd %xmm2, %xmm9
	; SSE2-NEXT: movd %xmm1, %ecx			; SSE2-NEXT: pand %xmm9, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]			; SSE2-NEXT: pandn %xmm7, %xmm9
	; SSE2-NEXT: movd %xmm1, %edx			; SSE2-NEXT: por %xmm9, %xmm3
	; SSE2-NEXT: subl %ecx, %edx			; SSE2-NEXT: psubd %xmm7, %xmm3
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: movd %xmm8, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm1
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
	; SSE2-NEXT: movd %xmm4, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3]
	; SSE2-NEXT: movd %xmm4, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
	; SSE2-NEXT: movd %xmm6, %ecx
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
	; SSE2-NEXT: movd %xmm6, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
	; SSE2-NEXT: movd %xmm5, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm6
	; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
	; SSE2-NEXT: movd %xmm7, %ecx
	; SSE2-NEXT: movd %xmm3, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %ecx
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
	; SSE2-NEXT: movd %xmm2, %edx
	; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: cmovbl %eax, %edx
	; SSE2-NEXT: movd %edx, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
	; SSE2-NEXT: movdqa %xmm4, %xmm2			; SSE2-NEXT: movdqa %xmm4, %xmm2
	; SSE2-NEXT: movdqa %xmm5, %xmm3
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v16i32:			; SSSE3-LABEL: v16i32:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movdqa %xmm1, %xmm8			; SSSE3-NEXT: movdqa %xmm1, %xmm8
	; SSSE3-NEXT: movdqa %xmm0, %xmm1			; SSSE3-NEXT: movdqa %xmm0, %xmm10
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,3]			; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: movdqa %xmm4, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]			; SSSE3-NEXT: pxor %xmm9, %xmm1
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm0
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: pand %xmm0, %xmm10
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pandn %xmm4, %xmm0
	; SSSE3-NEXT: movd %edx, %xmm9			; SSSE3-NEXT: por %xmm10, %xmm0
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]			; SSSE3-NEXT: psubd %xmm4, %xmm0
	; SSSE3-NEXT: movd %xmm0, %ecx			; SSSE3-NEXT: movdqa %xmm5, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSSE3-NEXT: pxor %xmm9, %xmm4
	; SSSE3-NEXT: movd %xmm0, %edx			; SSSE3-NEXT: movdqa %xmm8, %xmm1
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm1
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
	; SSSE3-NEXT: movd %edx, %xmm10			; SSSE3-NEXT: pand %xmm1, %xmm8
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]			; SSSE3-NEXT: pandn %xmm5, %xmm1
	; SSSE3-NEXT: movd %xmm4, %ecx			; SSSE3-NEXT: por %xmm8, %xmm1
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: psubd %xmm5, %xmm1
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: movdqa %xmm6, %xmm5
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: pxor %xmm9, %xmm5
	; SSSE3-NEXT: movd %edx, %xmm0			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]			; SSSE3-NEXT: pxor %xmm9, %xmm4
	; SSSE3-NEXT: movd %xmm4, %ecx			; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]			; SSSE3-NEXT: pand %xmm4, %xmm2
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: pandn %xmm6, %xmm4
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: por %xmm2, %xmm4
	; SSSE3-NEXT: cmovbl %eax, %edx			; SSSE3-NEXT: psubd %xmm6, %xmm4
	; SSSE3-NEXT: movd %edx, %xmm1			; SSSE3-NEXT: movdqa %xmm7, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; SSSE3-NEXT: pxor %xmm9, %xmm2
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]			; SSSE3-NEXT: pxor %xmm3, %xmm9
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]			; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
	; SSSE3-NEXT: movd %xmm1, %ecx			; SSSE3-NEXT: pand %xmm9, %xmm3
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]			; SSSE3-NEXT: pandn %xmm7, %xmm9
	; SSSE3-NEXT: movd %xmm1, %edx			; SSSE3-NEXT: por %xmm9, %xmm3
	; SSSE3-NEXT: subl %ecx, %edx			; SSSE3-NEXT: psubd %xmm7, %xmm3
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: movd %xmm8, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm1
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
	; SSSE3-NEXT: movd %xmm4, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3]
	; SSSE3-NEXT: movd %xmm4, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
	; SSSE3-NEXT: movd %xmm6, %ecx
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
	; SSSE3-NEXT: movd %xmm6, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
	; SSSE3-NEXT: movd %xmm5, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm6
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
	; SSSE3-NEXT: movd %xmm7, %ecx
	; SSSE3-NEXT: movd %xmm3, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %ecx
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
	; SSSE3-NEXT: movd %xmm2, %edx
	; SSSE3-NEXT: subl %ecx, %edx
	; SSSE3-NEXT: cmovbl %eax, %edx
	; SSSE3-NEXT: movd %edx, %xmm2
	; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
	; SSSE3-NEXT: movdqa %xmm4, %xmm2			; SSSE3-NEXT: movdqa %xmm4, %xmm2
	; SSSE3-NEXT: movdqa %xmm5, %xmm3
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v16i32:			; SSE41-LABEL: v16i32:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: movdqa %xmm1, %xmm8			; SSE41-NEXT: pmaxud %xmm4, %xmm0
	; SSE41-NEXT: movdqa %xmm0, %xmm1			; SSE41-NEXT: psubd %xmm4, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm4, %ecx			; SSE41-NEXT: pmaxud %xmm5, %xmm1
	; SSE41-NEXT: pextrd $1, %xmm0, %edx			; SSE41-NEXT: psubd %xmm5, %xmm1
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: pmaxud %xmm6, %xmm2
	; SSE41-NEXT: subl %ecx, %edx			; SSE41-NEXT: psubd %xmm6, %xmm2
	; SSE41-NEXT: cmovbl %eax, %edx			; SSE41-NEXT: pmaxud %xmm7, %xmm3
	; SSE41-NEXT: movd %xmm4, %ecx			; SSE41-NEXT: psubd %xmm7, %xmm3
	; SSE41-NEXT: movd %xmm0, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pinsrd $1, %edx, %xmm0
	; SSE41-NEXT: pextrd $2, %xmm4, %ecx
	; SSE41-NEXT: pextrd $2, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm0
	; SSE41-NEXT: pextrd $3, %xmm4, %ecx
	; SSE41-NEXT: pextrd $3, %xmm1, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm0
	; SSE41-NEXT: pextrd $1, %xmm5, %ecx
	; SSE41-NEXT: pextrd $1, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm5, %ecx
	; SSE41-NEXT: movd %xmm8, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm1
	; SSE41-NEXT: pinsrd $1, %edx, %xmm1
	; SSE41-NEXT: pextrd $2, %xmm5, %ecx
	; SSE41-NEXT: pextrd $2, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm1
	; SSE41-NEXT: pextrd $3, %xmm5, %ecx
	; SSE41-NEXT: pextrd $3, %xmm8, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm1
	; SSE41-NEXT: pextrd $1, %xmm6, %ecx
	; SSE41-NEXT: pextrd $1, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm6, %ecx
	; SSE41-NEXT: movd %xmm2, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm4
	; SSE41-NEXT: pinsrd $1, %edx, %xmm4
	; SSE41-NEXT: pextrd $2, %xmm6, %ecx
	; SSE41-NEXT: pextrd $2, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm4
	; SSE41-NEXT: pextrd $3, %xmm6, %ecx
	; SSE41-NEXT: pextrd $3, %xmm2, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm4
	; SSE41-NEXT: pextrd $1, %xmm7, %ecx
	; SSE41-NEXT: pextrd $1, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: movd %xmm7, %ecx
	; SSE41-NEXT: movd %xmm3, %esi
	; SSE41-NEXT: subl %ecx, %esi
	; SSE41-NEXT: cmovbl %eax, %esi
	; SSE41-NEXT: movd %esi, %xmm5
	; SSE41-NEXT: pinsrd $1, %edx, %xmm5
	; SSE41-NEXT: pextrd $2, %xmm7, %ecx
	; SSE41-NEXT: pextrd $2, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $2, %edx, %xmm5
	; SSE41-NEXT: pextrd $3, %xmm7, %ecx
	; SSE41-NEXT: pextrd $3, %xmm3, %edx
	; SSE41-NEXT: subl %ecx, %edx
	; SSE41-NEXT: cmovbl %eax, %edx
	; SSE41-NEXT: pinsrd $3, %edx, %xmm5
	; SSE41-NEXT: movdqa %xmm4, %xmm2
	; SSE41-NEXT: movdqa %xmm5, %xmm3
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v16i32:			; AVX1-LABEL: v16i32:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm4, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
	; AVX1-NEXT: vpextrd $1, %xmm5, %edx			; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm5
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm4, %ecx
	; AVX1-NEXT: vmovd %xmm5, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm6
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
	; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm5, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
	; AVX1-NEXT: vpextrd $3, %xmm4, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm5, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm2, %ecx
	; AVX1-NEXT: vmovd %xmm0, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm5
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm0, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
	; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
	; AVX1-NEXT: vpextrd $1, %xmm4, %edx			; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm4
	; AVX1-NEXT: subl %ecx, %edx			; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
	; AVX1-NEXT: cmovbl %eax, %edx			; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vmovd %xmm2, %ecx			; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: vmovd %xmm4, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm5
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm4, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm4, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
	; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $1, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vmovd %xmm3, %ecx
	; AVX1-NEXT: vmovd %xmm1, %esi
	; AVX1-NEXT: subl %ecx, %esi
	; AVX1-NEXT: cmovbl %eax, %esi
	; AVX1-NEXT: vmovd %esi, %xmm4
	; AVX1-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $2, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX1-NEXT: vpextrd $3, %xmm1, %edx
	; AVX1-NEXT: subl %ecx, %edx
	; AVX1-NEXT: cmovbl %eax, %edx
	; AVX1-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v16i32:			; AVX2-LABEL: v16i32:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4			; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: vpextrd $1, %xmm4, %ecx			; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5			; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: vpextrd $1, %xmm5, %edx			; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: xorl %eax, %eax
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm4, %ecx
	; AVX2-NEXT: vmovd %xmm5, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm6
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
	; AVX2-NEXT: vpextrd $2, %xmm4, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm5, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
	; AVX2-NEXT: vpextrd $3, %xmm4, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm5, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm0, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm5
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm0, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm2, %ecx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
	; AVX2-NEXT: vpextrd $1, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm2, %ecx
	; AVX2-NEXT: vmovd %xmm4, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm5
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX2-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm4, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
	; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $1, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vmovd %xmm3, %ecx
	; AVX2-NEXT: vmovd %xmm1, %esi
	; AVX2-NEXT: subl %ecx, %esi
	; AVX2-NEXT: cmovbl %eax, %esi
	; AVX2-NEXT: vmovd %esi, %xmm4
	; AVX2-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $2, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX2-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX2-NEXT: vpextrd $3, %xmm1, %edx
	; AVX2-NEXT: subl %ecx, %edx
	; AVX2-NEXT: cmovbl %eax, %edx
	; AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v16i32:			; AVX512-LABEL: v16i32:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2			; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vpextrd $1, %xmm2, %ecx			; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %edx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm2, %ecx
	; AVX512-NEXT: vmovd %xmm3, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm2, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm3, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
	; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4
	; AVX512-NEXT: vpextrd $1, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm3, %ecx
	; AVX512-NEXT: vmovd %xmm4, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm5
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm3, %ecx
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
	; AVX512-NEXT: vpextrd $1, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm3, %ecx
	; AVX512-NEXT: vmovd %xmm4, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm5
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $2, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
	; AVX512-NEXT: vpextrd $3, %xmm3, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm4, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
	; AVX512-NEXT: vpextrd $1, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $1, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vmovd %xmm1, %ecx
	; AVX512-NEXT: vmovd %xmm0, %esi
	; AVX512-NEXT: subl %ecx, %esi
	; AVX512-NEXT: cmovbl %eax, %esi
	; AVX512-NEXT: vmovd %esi, %xmm4
	; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $2, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
	; AVX512-NEXT: vpextrd $3, %xmm1, %ecx
	; AVX512-NEXT: vpextrd $3, %xmm0, %edx
	; AVX512-NEXT: subl %ecx, %edx
	; AVX512-NEXT: cmovbl %eax, %edx
	; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
	; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
	; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)			%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
	ret <16 x i32> %z			ret <16 x i32> %z
	}			}

	define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {			define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
	; SSE2-LABEL: v2i64:			; SSE2-LABEL: v2i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: movdqa %xmm1, %xmm3
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pxor %xmm2, %xmm3
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pxor %xmm0, %xmm2
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm4
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE2-NEXT: movq %xmm1, %rax			; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: por %xmm2, %xmm3
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: pand %xmm3, %xmm0
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: pandn %xmm1, %xmm3
	; SSE2-NEXT: movdqa %xmm2, %xmm0			; SSE2-NEXT: por %xmm3, %xmm0
				; SSE2-NEXT: psubq %xmm1, %xmm0
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v2i64:			; SSSE3-LABEL: v2i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: movdqa %xmm1, %xmm3
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pxor %xmm2, %xmm3
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pxor %xmm0, %xmm2
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm4
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSSE3-NEXT: movq %xmm1, %rax			; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: por %xmm2, %xmm3
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: pand %xmm3, %xmm0
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: pandn %xmm1, %xmm3
	; SSSE3-NEXT: movdqa %xmm2, %xmm0			; SSSE3-NEXT: por %xmm3, %xmm0
				; SSSE3-NEXT: psubq %xmm1, %xmm0
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v2i64:			; SSE41-LABEL: v2i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm1, %rax			; SSE41-NEXT: movdqa %xmm0, %xmm2
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: movdqa %xmm1, %xmm3
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm0, %xmm3
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pxor %xmm2, %xmm0
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: movq %xmm1, %rax			; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: pand %xmm5, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; SSE41-NEXT: por %xmm4, %xmm0
				; SSE41-NEXT: movdqa %xmm1, %xmm3
				; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
				; SSE41-NEXT: psubq %xmm1, %xmm3
				; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: v2i64:			; AVX1-LABEL: v2i64:
	; AVX: # %bb.0:			; AVX1: # %bb.0:
	; AVX-NEXT: vpextrq $1, %xmm1, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpextrq $1, %xmm0, %rcx			; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
	; AVX-NEXT: subq %rax, %rcx			; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
	; AVX-NEXT: vmovq %rcx, %xmm2			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vmovq %xmm1, %rax			; AVX1-NEXT: retq
	; AVX-NEXT: vmovq %xmm0, %rcx			;
	; AVX-NEXT: subq %rax, %rcx			; AVX2-LABEL: v2i64:
	; AVX-NEXT: cmovbq %rdx, %rcx			; AVX2: # %bb.0:
	; AVX-NEXT: vmovq %rcx, %xmm0			; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
	; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]			; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
	; AVX-NEXT: retq			; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
				; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
				; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
				; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX2-NEXT: retq
				;
				; AVX512-LABEL: v2i64:
				; AVX512: # %bb.0:
				; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
				; AVX512-NEXT: retq
	%z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)			%z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y)
	ret <2 x i64> %z			ret <2 x i64> %z
	}			}

	define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {			define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
	; SSE2-LABEL: v4i64:			; SSE2-LABEL: v4i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movq %xmm2, %rax			; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm5
	; SSE2-NEXT: xorl %edx, %edx			; SSE2-NEXT: pxor %xmm4, %xmm5
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: movdqa %xmm0, %xmm6
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pxor %xmm4, %xmm6
	; SSE2-NEXT: movq %rcx, %xmm4			; SSE2-NEXT: movdqa %xmm6, %xmm7
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
	; SSE2-NEXT: movq %xmm2, %rax			; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pand %xmm8, %xmm5
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: por %xmm5, %xmm6
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]			; SSE2-NEXT: pand %xmm6, %xmm0
	; SSE2-NEXT: movq %xmm3, %rax			; SSE2-NEXT: pandn %xmm2, %xmm6
	; SSE2-NEXT: movq %xmm1, %rcx			; SSE2-NEXT: por %xmm6, %xmm0
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: psubq %xmm2, %xmm0
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: movdqa %xmm3, %xmm2
	; SSE2-NEXT: movq %rcx, %xmm2			; SSE2-NEXT: pxor %xmm4, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]			; SSE2-NEXT: pxor %xmm1, %xmm4
	; SSE2-NEXT: movq %xmm0, %rax			; SSE2-NEXT: movdqa %xmm4, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
	; SSE2-NEXT: movq %xmm0, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSE2-NEXT: subq %rax, %rcx			; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
	; SSE2-NEXT: cmovbq %rdx, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
	; SSE2-NEXT: movq %rcx, %xmm0			; SSE2-NEXT: pand %xmm6, %xmm2
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSE2-NEXT: movdqa %xmm4, %xmm0			; SSE2-NEXT: por %xmm2, %xmm4
	; SSE2-NEXT: movdqa %xmm2, %xmm1			; SSE2-NEXT: pand %xmm4, %xmm1
				; SSE2-NEXT: pandn %xmm3, %xmm4
				; SSE2-NEXT: por %xmm4, %xmm1
				; SSE2-NEXT: psubq %xmm3, %xmm1
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v4i64:			; SSSE3-LABEL: v4i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movq %xmm2, %rax			; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm5
	; SSSE3-NEXT: xorl %edx, %edx			; SSSE3-NEXT: pxor %xmm4, %xmm5
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: movdqa %xmm0, %xmm6
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pxor %xmm4, %xmm6
	; SSSE3-NEXT: movq %rcx, %xmm4			; SSSE3-NEXT: movdqa %xmm6, %xmm7
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
	; SSSE3-NEXT: movq %xmm2, %rax			; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]			; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pand %xmm8, %xmm5
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: por %xmm5, %xmm6
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]			; SSSE3-NEXT: pand %xmm6, %xmm0
	; SSSE3-NEXT: movq %xmm3, %rax			; SSSE3-NEXT: pandn %xmm2, %xmm6
	; SSSE3-NEXT: movq %xmm1, %rcx			; SSSE3-NEXT: por %xmm6, %xmm0
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: psubq %xmm2, %xmm0
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: movdqa %xmm3, %xmm2
	; SSSE3-NEXT: movq %rcx, %xmm2			; SSSE3-NEXT: pxor %xmm4, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]			; SSSE3-NEXT: pxor %xmm1, %xmm4
	; SSSE3-NEXT: movq %xmm0, %rax			; SSSE3-NEXT: movdqa %xmm4, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]			; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
	; SSSE3-NEXT: movq %xmm0, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSSE3-NEXT: subq %rax, %rcx			; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
	; SSSE3-NEXT: cmovbq %rdx, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
	; SSSE3-NEXT: movq %rcx, %xmm0			; SSSE3-NEXT: pand %xmm6, %xmm2
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSSE3-NEXT: movdqa %xmm4, %xmm0			; SSSE3-NEXT: por %xmm2, %xmm4
	; SSSE3-NEXT: movdqa %xmm2, %xmm1			; SSSE3-NEXT: pand %xmm4, %xmm1
				; SSSE3-NEXT: pandn %xmm3, %xmm4
				; SSSE3-NEXT: por %xmm4, %xmm1
				; SSSE3-NEXT: psubq %xmm3, %xmm1
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v4i64:			; SSE41-LABEL: v4i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm2, %rax			; SSE41-NEXT: movdqa %xmm0, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm0, %rcx			; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: xorl %edx, %edx			; SSE41-NEXT: movdqa %xmm2, %xmm5
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pxor %xmm6, %xmm5
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: movdqa %xmm0, %xmm7
	; SSE41-NEXT: movq %rcx, %xmm4			; SSE41-NEXT: pxor %xmm6, %xmm7
	; SSE41-NEXT: movq %xmm2, %rax			; SSE41-NEXT: movdqa %xmm7, %xmm0
	; SSE41-NEXT: movq %xmm0, %rcx			; SSE41-NEXT: pcmpgtd %xmm5, %xmm0
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pcmpeqd %xmm5, %xmm7
	; SSE41-NEXT: movq %rcx, %xmm0			; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]			; SSE41-NEXT: pand %xmm8, %xmm5
	; SSE41-NEXT: pextrq $1, %xmm3, %rax			; SSE41-NEXT: por %xmm5, %xmm0
	; SSE41-NEXT: pextrq $1, %xmm1, %rcx			; SSE41-NEXT: movdqa %xmm2, %xmm5
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: psubq %xmm2, %xmm5
	; SSE41-NEXT: movq %rcx, %xmm2			; SSE41-NEXT: movdqa %xmm3, %xmm0
	; SSE41-NEXT: movq %xmm3, %rax			; SSE41-NEXT: pxor %xmm6, %xmm0
	; SSE41-NEXT: movq %xmm1, %rcx			; SSE41-NEXT: pxor %xmm1, %xmm6
	; SSE41-NEXT: subq %rax, %rcx			; SSE41-NEXT: movdqa %xmm6, %xmm2
	; SSE41-NEXT: cmovbq %rdx, %rcx			; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
	; SSE41-NEXT: movq %rcx, %xmm1			; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]			; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
				; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
				; SSE41-NEXT: pand %xmm4, %xmm0
				; SSE41-NEXT: por %xmm2, %xmm0
				; SSE41-NEXT: movdqa %xmm3, %xmm2
				; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
				; SSE41-NEXT: psubq %xmm3, %xmm2
				; SSE41-NEXT: movdqa %xmm5, %xmm0
				; SSE41-NEXT: movdqa %xmm2, %xmm1
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v4i64:			; AVX1-LABEL: v4i64:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
	; AVX1-NEXT: vpextrq $1, %xmm2, %rax			; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
				; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
				; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
				; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm5
				; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
				; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
				; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
				; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm3
				; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
				; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
	; AVX1-NEXT: vpextrq $1, %xmm3, %rcx			; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
	; AVX1-NEXT: xorl %edx, %edx			; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm4
	; AVX1-NEXT: vmovq %xmm2, %rax
	; AVX1-NEXT: vmovq %xmm3, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm2
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX1-NEXT: vpextrq $1, %xmm1, %rax
	; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm3
	; AVX1-NEXT: vmovq %xmm1, %rax
	; AVX1-NEXT: vmovq %xmm0, %rcx
	; AVX1-NEXT: subq %rax, %rcx
	; AVX1-NEXT: cmovbq %rdx, %rcx
	; AVX1-NEXT: vmovq %rcx, %xmm0
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v4i64:			; AVX2-LABEL: v4i64:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	; AVX2-NEXT: vpextrq $1, %xmm2, %rax			; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3			; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
	; AVX2-NEXT: vpextrq $1, %xmm3, %rcx			; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
	; AVX2-NEXT: xorl %edx, %edx			; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: subq %rax, %rcx			; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm4
	; AVX2-NEXT: vmovq %xmm2, %rax
	; AVX2-NEXT: vmovq %xmm3, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm2
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX2-NEXT: vpextrq $1, %xmm1, %rax
	; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm3
	; AVX2-NEXT: vmovq %xmm1, %rax
	; AVX2-NEXT: vmovq %xmm0, %rcx
	; AVX2-NEXT: subq %rax, %rcx
	; AVX2-NEXT: cmovbq %rdx, %rcx
	; AVX2-NEXT: vmovq %rcx, %xmm0
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v4i64:			; AVX512-LABEL: v4i64:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2			; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vpextrq $1, %xmm2, %rax			; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: xorl %edx, %edx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm4
	; AVX512-NEXT: vmovq %xmm2, %rax
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm2
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX512-NEXT: vpextrq $1, %xmm1, %rax
	; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm3
	; AVX512-NEXT: vmovq %xmm1, %rax
	; AVX512-NEXT: vmovq %xmm0, %rcx
	; AVX512-NEXT: subq %rax, %rcx
	; AVX512-NEXT: cmovbq %rdx, %rcx
	; AVX512-NEXT: vmovq %rcx, %xmm0
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)			%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
	ret <4 x i64> %z			ret <4 x i64> %z
	}			}

	define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {			define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
	; SSE2-LABEL: v8i64:			; SSE2-LABEL: v8i64:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: movdqa %xmm1, %xmm8			; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
	; SSE2-NEXT: movdqa %xmm0, %xmm1			; SSE2-NEXT: movdqa %xmm4, %xmm9
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pxor %xmm8, %xmm9
	; SSE2-NEXT: movq %xmm0, %rdx			; SSE2-NEXT: movdqa %xmm0, %xmm10
	; SSE2-NEXT: xorl %eax, %eax			; SSE2-NEXT: pxor %xmm8, %xmm10
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: movdqa %xmm10, %xmm11
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
	; SSE2-NEXT: movq %rdx, %xmm0			; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]			; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSE2-NEXT: pand %xmm12, %xmm9
	; SSE2-NEXT: movq %xmm1, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm9, %xmm10
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm10, %xmm0
	; SSE2-NEXT: movq %rdx, %xmm1			; SSE2-NEXT: pandn %xmm4, %xmm10
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; SSE2-NEXT: por %xmm10, %xmm0
	; SSE2-NEXT: movq %xmm5, %rcx			; SSE2-NEXT: psubq %xmm4, %xmm0
	; SSE2-NEXT: movq %xmm8, %rdx			; SSE2-NEXT: movdqa %xmm5, %xmm9
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pxor %xmm8, %xmm9
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: movdqa %xmm1, %xmm4
	; SSE2-NEXT: movq %rdx, %xmm1			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]			; SSE2-NEXT: movdqa %xmm4, %xmm10
	; SSE2-NEXT: movq %xmm4, %rcx			; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
	; SSE2-NEXT: movq %xmm4, %rdx			; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm11, %xmm9
	; SSE2-NEXT: movq %rdx, %xmm4			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSE2-NEXT: por %xmm9, %xmm4
	; SSE2-NEXT: movq %xmm6, %rcx			; SSE2-NEXT: pand %xmm4, %xmm1
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: pandn %xmm5, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm4, %xmm1
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: psubq %xmm5, %xmm1
	; SSE2-NEXT: movq %rdx, %xmm4			; SSE2-NEXT: movdqa %xmm6, %xmm4
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: movq %xmm5, %rcx			; SSE2-NEXT: movdqa %xmm2, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSE2-NEXT: pxor %xmm8, %xmm5
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: movdqa %xmm5, %xmm9
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
	; SSE2-NEXT: movq %rdx, %xmm2			; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSE2-NEXT: movq %xmm7, %rcx			; SSE2-NEXT: pand %xmm10, %xmm4
	; SSE2-NEXT: movq %xmm3, %rdx			; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: por %xmm4, %xmm5
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: pand %xmm5, %xmm2
	; SSE2-NEXT: movq %rdx, %xmm5			; SSE2-NEXT: pandn %xmm6, %xmm5
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]			; SSE2-NEXT: por %xmm5, %xmm2
	; SSE2-NEXT: movq %xmm2, %rcx			; SSE2-NEXT: psubq %xmm6, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]			; SSE2-NEXT: movdqa %xmm7, %xmm4
	; SSE2-NEXT: movq %xmm2, %rdx			; SSE2-NEXT: pxor %xmm8, %xmm4
	; SSE2-NEXT: subq %rcx, %rdx			; SSE2-NEXT: pxor %xmm3, %xmm8
	; SSE2-NEXT: cmovbq %rax, %rdx			; SSE2-NEXT: movdqa %xmm8, %xmm5
	; SSE2-NEXT: movq %rdx, %xmm2			; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
	; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]			; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSE2-NEXT: movdqa %xmm4, %xmm2			; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
	; SSE2-NEXT: movdqa %xmm5, %xmm3			; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
				; SSE2-NEXT: pand %xmm6, %xmm4
				; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
				; SSE2-NEXT: por %xmm4, %xmm5
				; SSE2-NEXT: pand %xmm5, %xmm3
				; SSE2-NEXT: pandn %xmm7, %xmm5
				; SSE2-NEXT: por %xmm5, %xmm3
				; SSE2-NEXT: psubq %xmm7, %xmm3
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSSE3-LABEL: v8i64:			; SSSE3-LABEL: v8i64:
	; SSSE3: # %bb.0:			; SSSE3: # %bb.0:
	; SSSE3-NEXT: movdqa %xmm1, %xmm8			; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
	; SSSE3-NEXT: movdqa %xmm0, %xmm1			; SSSE3-NEXT: movdqa %xmm4, %xmm9
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pxor %xmm8, %xmm9
	; SSSE3-NEXT: movq %xmm0, %rdx			; SSSE3-NEXT: movdqa %xmm0, %xmm10
	; SSSE3-NEXT: xorl %eax, %eax			; SSSE3-NEXT: pxor %xmm8, %xmm10
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: movdqa %xmm10, %xmm11
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11
	; SSSE3-NEXT: movq %rdx, %xmm0			; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]			; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
	; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]			; SSSE3-NEXT: pand %xmm12, %xmm9
	; SSSE3-NEXT: movq %xmm1, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3]
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm9, %xmm10
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm10, %xmm0
	; SSSE3-NEXT: movq %rdx, %xmm1			; SSSE3-NEXT: pandn %xmm4, %xmm10
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]			; SSSE3-NEXT: por %xmm10, %xmm0
	; SSSE3-NEXT: movq %xmm5, %rcx			; SSSE3-NEXT: psubq %xmm4, %xmm0
	; SSSE3-NEXT: movq %xmm8, %rdx			; SSSE3-NEXT: movdqa %xmm5, %xmm9
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pxor %xmm8, %xmm9
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: movdqa %xmm1, %xmm4
	; SSSE3-NEXT: movq %rdx, %xmm1			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]			; SSSE3-NEXT: movdqa %xmm4, %xmm10
	; SSSE3-NEXT: movq %xmm4, %rcx			; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10
	; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]			; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
	; SSSE3-NEXT: movq %xmm4, %rdx			; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm11, %xmm9
	; SSSE3-NEXT: movq %rdx, %xmm4			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSSE3-NEXT: por %xmm9, %xmm4
	; SSSE3-NEXT: movq %xmm6, %rcx			; SSSE3-NEXT: pand %xmm4, %xmm1
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: pandn %xmm5, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm4, %xmm1
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: psubq %xmm5, %xmm1
	; SSSE3-NEXT: movq %rdx, %xmm4			; SSSE3-NEXT: movdqa %xmm6, %xmm4
	; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: movq %xmm5, %rcx			; SSSE3-NEXT: movdqa %xmm2, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]			; SSSE3-NEXT: pxor %xmm8, %xmm5
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: movdqa %xmm5, %xmm9
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
	; SSSE3-NEXT: movq %rdx, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
	; SSSE3-NEXT: movq %xmm7, %rcx			; SSSE3-NEXT: pand %xmm10, %xmm4
	; SSSE3-NEXT: movq %xmm3, %rdx			; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: por %xmm4, %xmm5
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: pand %xmm5, %xmm2
	; SSSE3-NEXT: movq %rdx, %xmm5			; SSSE3-NEXT: pandn %xmm6, %xmm5
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]			; SSSE3-NEXT: por %xmm5, %xmm2
	; SSSE3-NEXT: movq %xmm2, %rcx			; SSSE3-NEXT: psubq %xmm6, %xmm2
	; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]			; SSSE3-NEXT: movdqa %xmm7, %xmm4
	; SSSE3-NEXT: movq %xmm2, %rdx			; SSSE3-NEXT: pxor %xmm8, %xmm4
	; SSSE3-NEXT: subq %rcx, %rdx			; SSSE3-NEXT: pxor %xmm3, %xmm8
	; SSSE3-NEXT: cmovbq %rax, %rdx			; SSSE3-NEXT: movdqa %xmm8, %xmm5
	; SSSE3-NEXT: movq %rdx, %xmm2			; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
	; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]			; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
	; SSSE3-NEXT: movdqa %xmm4, %xmm2			; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8
	; SSSE3-NEXT: movdqa %xmm5, %xmm3			; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
				; SSSE3-NEXT: pand %xmm6, %xmm4
				; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
				; SSSE3-NEXT: por %xmm4, %xmm5
				; SSSE3-NEXT: pand %xmm5, %xmm3
				; SSSE3-NEXT: pandn %xmm7, %xmm5
				; SSSE3-NEXT: por %xmm5, %xmm3
				; SSSE3-NEXT: psubq %xmm7, %xmm3
	; SSSE3-NEXT: retq			; SSSE3-NEXT: retq
	;			;
	; SSE41-LABEL: v8i64:			; SSE41-LABEL: v8i64:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pextrq $1, %xmm4, %rcx			; SSE41-NEXT: movdqa %xmm1, %xmm8
	; SSE41-NEXT: pextrq $1, %xmm0, %rdx			; SSE41-NEXT: movdqa %xmm0, %xmm11
	; SSE41-NEXT: xorl %eax, %eax			; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm9
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm9
	; SSE41-NEXT: movq %rdx, %xmm8			; SSE41-NEXT: movdqa %xmm0, %xmm1
	; SSE41-NEXT: movq %xmm4, %rcx			; SSE41-NEXT: pxor %xmm10, %xmm1
	; SSE41-NEXT: movq %xmm0, %rdx			; SSE41-NEXT: movdqa %xmm1, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pcmpgtd %xmm9, %xmm0
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2]
	; SSE41-NEXT: movq %rdx, %xmm0			; SSE41-NEXT: pcmpeqd %xmm9, %xmm1
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]			; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
	; SSE41-NEXT: pextrq $1, %xmm5, %rcx			; SSE41-NEXT: pand %xmm12, %xmm1
	; SSE41-NEXT: pextrq $1, %xmm1, %rdx			; SSE41-NEXT: por %xmm1, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm9
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm9
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: psubq %xmm4, %xmm9
	; SSE41-NEXT: movq %xmm5, %rcx			; SSE41-NEXT: movdqa %xmm5, %xmm0
	; SSE41-NEXT: movq %xmm1, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: movdqa %xmm8, %xmm1
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pxor %xmm10, %xmm1
	; SSE41-NEXT: movq %rdx, %xmm1			; SSE41-NEXT: movdqa %xmm1, %xmm4
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]			; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm6, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,2,2]
	; SSE41-NEXT: pextrq $1, %xmm2, %rdx			; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pand %xmm11, %xmm0
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: por %xmm4, %xmm0
	; SSE41-NEXT: movq %xmm6, %rcx			; SSE41-NEXT: movdqa %xmm5, %xmm1
	; SSE41-NEXT: movq %xmm2, %rdx			; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: psubq %xmm5, %xmm1
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: movdqa %xmm6, %xmm0
	; SSE41-NEXT: movq %rdx, %xmm2			; SSE41-NEXT: pxor %xmm10, %xmm0
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]			; SSE41-NEXT: movdqa %xmm2, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm7, %rcx			; SSE41-NEXT: pxor %xmm10, %xmm4
	; SSE41-NEXT: pextrq $1, %xmm3, %rdx			; SSE41-NEXT: movdqa %xmm4, %xmm5
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
	; SSE41-NEXT: movq %rdx, %xmm4			; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
	; SSE41-NEXT: movq %xmm7, %rcx			; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
	; SSE41-NEXT: movq %xmm3, %rdx			; SSE41-NEXT: pand %xmm8, %xmm0
	; SSE41-NEXT: subq %rcx, %rdx			; SSE41-NEXT: por %xmm5, %xmm0
	; SSE41-NEXT: cmovbq %rax, %rdx			; SSE41-NEXT: movdqa %xmm6, %xmm4
	; SSE41-NEXT: movq %rdx, %xmm3			; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
	; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]			; SSE41-NEXT: psubq %xmm6, %xmm4
				; SSE41-NEXT: movdqa %xmm7, %xmm0
				; SSE41-NEXT: pxor %xmm10, %xmm0
				; SSE41-NEXT: pxor %xmm3, %xmm10
				; SSE41-NEXT: movdqa %xmm10, %xmm2
				; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
				; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
				; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
				; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
				; SSE41-NEXT: pand %xmm5, %xmm0
				; SSE41-NEXT: por %xmm2, %xmm0
				; SSE41-NEXT: movdqa %xmm7, %xmm5
				; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
				; SSE41-NEXT: psubq %xmm7, %xmm5
				; SSE41-NEXT: movdqa %xmm9, %xmm0
				; SSE41-NEXT: movdqa %xmm4, %xmm2
				; SSE41-NEXT: movdqa %xmm5, %xmm3
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: v8i64:			; AVX1-LABEL: v8i64:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
	; AVX1-NEXT: vpextrq $1, %xmm4, %rcx			; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
	; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5			; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
	; AVX1-NEXT: vpextrq $1, %xmm5, %rdx			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
	; AVX1-NEXT: xorl %eax, %eax			; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm7
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8
	; AVX1-NEXT: cmovbq %rax, %rdx			; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7
	; AVX1-NEXT: vmovq %rdx, %xmm6			; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6
	; AVX1-NEXT: vmovq %xmm4, %rcx			; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
	; AVX1-NEXT: vmovq %xmm5, %rdx			; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vblendvpd %ymm6, %ymm0, %ymm2, %ymm0
	; AVX1-NEXT: cmovbq %rax, %rdx			; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
	; AVX1-NEXT: vmovq %rdx, %xmm4			; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]			; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
	; AVX1-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm5
	; AVX1-NEXT: vmovq %xmm2, %rcx
	; AVX1-NEXT: vmovq %xmm0, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm0
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0			; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
	; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2			; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
	; AVX1-NEXT: vpextrq $1, %xmm2, %rcx			; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4
				; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
				; AVX1-NEXT: vpxor %xmm5, %xmm6, %xmm6
				; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
				; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm6
				; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm5
				; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5
				; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
				; AVX1-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
	; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4			; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
	; AVX1-NEXT: vpextrq $1, %xmm4, %rdx			; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
	; AVX1-NEXT: subq %rcx, %rdx			; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm5
	; AVX1-NEXT: vmovq %xmm2, %rcx
	; AVX1-NEXT: vmovq %xmm4, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm2
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
	; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX1-NEXT: vpextrq $1, %xmm1, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm4
	; AVX1-NEXT: vmovq %xmm3, %rcx
	; AVX1-NEXT: vmovq %xmm1, %rdx
	; AVX1-NEXT: subq %rcx, %rdx
	; AVX1-NEXT: cmovbq %rax, %rdx
	; AVX1-NEXT: vmovq %rdx, %xmm1
	; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1			; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: v8i64:			; AVX2-LABEL: v8i64:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4			; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	; AVX2-NEXT: vpextrq $1, %xmm4, %rcx			; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
	; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5			; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
	; AVX2-NEXT: vpextrq $1, %xmm5, %rdx			; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5
	; AVX2-NEXT: xorl %eax, %eax			; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
	; AVX2-NEXT: subq %rcx, %rdx			; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
	; AVX2-NEXT: cmovbq %rax, %rdx			; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2
	; AVX2-NEXT: vmovq %rdx, %xmm6			; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm4
	; AVX2-NEXT: vmovq %xmm4, %rcx			; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2
	; AVX2-NEXT: vmovq %xmm5, %rdx			; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
	; AVX2-NEXT: subq %rcx, %rdx			; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm4
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
	; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm5
	; AVX2-NEXT: vmovq %xmm2, %rcx
	; AVX2-NEXT: vmovq %xmm0, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm0
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
	; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
	; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
	; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
	; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
	; AVX2-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm5
	; AVX2-NEXT: vmovq %xmm2, %rcx
	; AVX2-NEXT: vmovq %xmm4, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm2
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
	; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX2-NEXT: vpextrq $1, %xmm1, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm4
	; AVX2-NEXT: vmovq %xmm3, %rcx
	; AVX2-NEXT: vmovq %xmm1, %rdx
	; AVX2-NEXT: subq %rcx, %rdx
	; AVX2-NEXT: cmovbq %rax, %rdx
	; AVX2-NEXT: vmovq %rdx, %xmm1
	; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	;			;
	; AVX512-LABEL: v8i64:			; AVX512-LABEL: v8i64:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm2			; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vpextrq $1, %xmm2, %rcx			; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
	; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
	; AVX512-NEXT: xorl %eax, %eax
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm4
	; AVX512-NEXT: vmovq %xmm2, %rcx
	; AVX512-NEXT: vmovq %xmm3, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm2
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
	; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm4
	; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm5
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: vmovq %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm3
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
	; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
	; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
	; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
	; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
	; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm5
	; AVX512-NEXT: vmovq %xmm3, %rcx
	; AVX512-NEXT: vmovq %xmm4, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm3
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
	; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
	; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm4
	; AVX512-NEXT: vmovq %xmm1, %rcx
	; AVX512-NEXT: vmovq %xmm0, %rdx
	; AVX512-NEXT: subq %rcx, %rdx
	; AVX512-NEXT: cmovbq %rax, %rdx
	; AVX512-NEXT: vmovq %rdx, %xmm0
	; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
	; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
	; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)			%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
	ret <8 x i64> %z			ret <8 x i64> %z
	}			}

	define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {			define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
	; SSE-LABEL: v2i128:			; SSE-LABEL: v2i128:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	Show All 36 Lines

llvm/trunk/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

Show All 20 Lines
@c8 = common global [64 x i8] zeroinitializer, align 64		@c8 = common global [64 x i8] zeroinitializer, align 64

declare i64 @llvm.usub.sat.i64(i64, i64)		declare i64 @llvm.usub.sat.i64(i64, i64)
declare i32 @llvm.usub.sat.i32(i32, i32)		declare i32 @llvm.usub.sat.i32(i32, i32)
declare i16 @llvm.usub.sat.i16(i16, i16)		declare i16 @llvm.usub.sat.i16(i16, i16)
declare i8 @llvm.usub.sat.i8 (i8 , i8 )		declare i8 @llvm.usub.sat.i8 (i8 , i8 )

define void @sub_v8i64() {		define void @sub_v8i64() {
; CHECK-LABEL: @sub_v8i64(		; SSE-LABEL: @sub_v8i64(
; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8		; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8		; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8		; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8		; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8		; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8		; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8		; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8		; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8		; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
; CHECK-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])		; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
; CHECK-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])		; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
; CHECK-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])		; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
; CHECK-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])		; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
; CHECK-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])		; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
; CHECK-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])		; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
; CHECK-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])		; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
; CHECK-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])		; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8		; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8		; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8		; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8		; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; SLM-LABEL: @sub_v8i64(
		; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; SLM-NEXT: ret void
		;
		; AVX1-LABEL: @sub_v8i64(
		; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; AVX1-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; AVX1-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; AVX1-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; AVX1-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; AVX1-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; AVX1-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; AVX1-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; AVX1-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; AVX1-NEXT: ret void
		;
		; AVX2-LABEL: @sub_v8i64(
		; AVX2-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
		; AVX2-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
		; AVX2-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
		; AVX2-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
		; AVX2-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
		; AVX2-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
		; AVX2-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
		; AVX2-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
		; AVX2-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
		; AVX2-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
		; AVX2-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
		; AVX2-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
		; AVX2-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
		; AVX2-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
		; AVX2-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
		; AVX2-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
		; AVX2-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
		; AVX2-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
		; AVX2-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
		; AVX2-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
		; AVX2-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
		; AVX2-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
		; AVX2-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
		; AVX2-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
		; AVX2-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
		; AVX2-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
		; AVX2-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
		; AVX2-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
		; AVX2-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
		; AVX2-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
		; AVX2-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
		; AVX2-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
		; AVX2-NEXT: ret void
		;
		; AVX512-LABEL: @sub_v8i64(
		; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
		; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
		; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
		; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
		; AVX512-NEXT: ret void
		;
		; AVX256BW-LABEL: @sub_v8i64(
		; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
		; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
		; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
		; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
		; AVX256BW-NEXT: ret void
;		;
%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8		%a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8		%a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8		%a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8		%a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8		%a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8		%a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8		%a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
Show All 21 Lines	;
store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8		store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8		store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8		store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8		store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
ret void		ret void
}		}

define void @sub_v16i32() {		define void @sub_v16i32() {
; CHECK-LABEL: @sub_v16i32(		; SSE-LABEL: @sub_v16i32(
; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4		; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4		; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4		; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4		; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4		; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4		; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4		; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4		; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4		; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4		; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4		; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4		; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4		; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4		; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4		; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4		; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4		; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4		; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4		; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4		; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4		; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4		; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4		; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4		; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4		; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4		; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4		; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4		; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4		; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4		; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4		; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4		; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A0]], i32 [[B0]])		; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A0]], i32 [[B0]])
; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A1]], i32 [[B1]])		; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A1]], i32 [[B1]])
; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A2]], i32 [[B2]])		; SSE-NEXT: [[R2:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A2]], i32 [[B2]])
; CHECK-NEXT: [[R3:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A3]], i32 [[B3]])		; SSE-NEXT: [[R3:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A3]], i32 [[B3]])
; CHECK-NEXT: [[R4:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A4]], i32 [[B4]])		; SSE-NEXT: [[R4:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A4]], i32 [[B4]])
; CHECK-NEXT: [[R5:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A5]], i32 [[B5]])		; SSE-NEXT: [[R5:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A5]], i32 [[B5]])
; CHECK-NEXT: [[R6:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A6]], i32 [[B6]])		; SSE-NEXT: [[R6:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A6]], i32 [[B6]])
; CHECK-NEXT: [[R7:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A7]], i32 [[B7]])		; SSE-NEXT: [[R7:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A7]], i32 [[B7]])
; CHECK-NEXT: [[R8:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A8]], i32 [[B8]])		; SSE-NEXT: [[R8:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A8]], i32 [[B8]])
; CHECK-NEXT: [[R9:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A9]], i32 [[B9]])		; SSE-NEXT: [[R9:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A9]], i32 [[B9]])
; CHECK-NEXT: [[R10:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A10]], i32 [[B10]])		; SSE-NEXT: [[R10:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A10]], i32 [[B10]])
; CHECK-NEXT: [[R11:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A11]], i32 [[B11]])		; SSE-NEXT: [[R11:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A11]], i32 [[B11]])
; CHECK-NEXT: [[R12:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A12]], i32 [[B12]])		; SSE-NEXT: [[R12:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A12]], i32 [[B12]])
; CHECK-NEXT: [[R13:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A13]], i32 [[B13]])		; SSE-NEXT: [[R13:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A13]], i32 [[B13]])
; CHECK-NEXT: [[R14:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A14]], i32 [[B14]])		; SSE-NEXT: [[R14:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A14]], i32 [[B14]])
; CHECK-NEXT: [[R15:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A15]], i32 [[B15]])		; SSE-NEXT: [[R15:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A15]], i32 [[B15]])
; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4		; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4		; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4		; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4		; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4		; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4		; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4		; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4		; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4		; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4		; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4		; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4		; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4		; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4		; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4		; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4		; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
; CHECK-NEXT: ret void		; SSE-NEXT: ret void
		;
		; SLM-LABEL: @sub_v16i32(
		; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
		; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
		; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
		; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
		; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
		; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
		; SLM-NEXT: ret void
		;
		; AVX-LABEL: @sub_v16i32(
		; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
		; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
		; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
		; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
		; AVX-NEXT: ret void
		;
		; AVX512-LABEL: @sub_v16i32(
		; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
		; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
		; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
		; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
		; AVX512-NEXT: ret void
;		;
%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4		%a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4		%a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4		%a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4		%a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4		%a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4		%a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4		%a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
▲ Show 20 Lines • Show All 530 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 181820

llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/arith-usat.ll

llvm/trunk/test/CodeGen/X86/usub_sat.ll

llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll

llvm/trunk/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 181820

llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp

llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp

llvm/trunk/test/Analysis/CostModel/X86/arith-usat.ll

llvm/trunk/test/CodeGen/X86/usub_sat.ll

llvm/trunk/test/CodeGen/X86/usub_sat_vec.ll

llvm/trunk/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll

[CodeGen][X86] Expand vector USUBSAT to UMAX+SUB
ClosedPublic