Diff 138112

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,338 Lines • ▼ Show 20 Lines	if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);		setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);		setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

if (Subtarget.hasDQI()) {		if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);		setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);		setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);		setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);		setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);

		setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}		}

if (Subtarget.hasCDI()) {		if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.		// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : { MVT::v16i32, MVT::v8i64} ) {		for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);		setOperationAction(ISD::CTLZ, VT, Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);		setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}		}
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
setOperationAction(ISD::MSCATTER, VT, Custom);		setOperationAction(ISD::MSCATTER, VT, Custom);

if (Subtarget.hasDQI()) {		if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {		for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT, Legal);		setOperationAction(ISD::SINT_TO_FP, VT, Legal);
setOperationAction(ISD::UINT_TO_FP, VT, Legal);		setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);		setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);		setOperationAction(ISD::FP_TO_UINT, VT, Legal);

		setOperationAction(ISD::MUL, VT, Legal);
}		}
}		}

if (Subtarget.hasCDI()) {		if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {		for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::CTLZ, VT, Legal);		setOperationAction(ISD::CTLZ, VT, Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);		setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}		}
▲ Show 20 Lines • Show All 3,657 Lines • ▼ Show 20 Lines
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in		// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for		// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do not have to be of type VT.		// deciding if/how to split Ops. Ops elements do not have to be of type VT.
// The argument Builder is a function that will be applied on each split part:		// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)		// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>		template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,		SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,		const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
F Builder) {		F Builder, bool CheckBWI = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");		assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;		unsigned NumSubs = 1;
if (Subtarget.useBWIRegs()) {		if ((CheckBWI && Subtarget.useBWIRegs()) ||
		(!CheckBWI && Subtarget.useAVX512Regs())) {
if (VT.getSizeInBits() > 512) {		if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;		NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");		assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
}		}
} else if (Subtarget.hasAVX2()) {		} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256) {		if (VT.getSizeInBits() > 256) {
NumSubs = VT.getSizeInBits() / 256;		NumSubs = VT.getSizeInBits() / 256;
assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");		assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
Show All 22 Lines	SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);		return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}		}

// Helper for splitting operands of a binary operation to legal target size and		// Helper for splitting operands of a binary operation to legal target size and
// apply a function on each part.		// apply a function on each part.
template <typename F>		template <typename F>
SDValue SplitBinaryOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,		SDValue SplitBinaryOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
const SDLoc &DL, EVT VT, SDValue Op0,		const SDLoc &DL, EVT VT, SDValue Op0,
SDValue Op1, F Builder) {		SDValue Op1, F Builder, bool CheckBWI = true) {
SDValue Ops[] = {Op0, Op1};		SDValue Ops[] = {Op0, Op1};
return SplitOpsAndApply(DAG, Subtarget, DL, VT, makeArrayRef(Ops), Builder);		return SplitOpsAndApply(DAG, Subtarget, DL, VT, makeArrayRef(Ops), Builder,
		CheckBWI);
}		}

// Return true if the instruction zeroes the unused upper part of the		// Return true if the instruction zeroes the unused upper part of the
// destination and accepts mask.		// destination and accepts mask.
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {		static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {		switch (Opcode) {
default:		default:
return false;		return false;
▲ Show 20 Lines • Show All 17,349 Lines • ▼ Show 20 Lines	if (VT == MVT::v4i32) {
// Merge the two vectors back together with a shuffle. This expands into 2		// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.		// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };		static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);		return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}		}

assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&		assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");		"Only know how to lower V2I64/V4I64/V8I64 multiply");
		assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
// MULDQ returns the 64-bit result of the signed multiplication of the lower
// 32-bits. We can lower with this if the sign bits stretch that far.
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
DAG.ComputeNumSignBits(B) > 32) {
return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B);
}

// Ahi = psrlqi(a, 32);		// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);		// Bhi = psrlqi(b, 32);
//		//
// AloBlo = pmuludq(a, b);		// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);		// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);		// AhiBlo = pmuludq(Ahi, b);
//		//
// Hi = psllqi(AloBhi + AhiBlo, 32);		// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;		// return AloBlo + Hi;
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);		APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);		bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);		bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);		APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);		bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);		bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

// If DQI is supported we can use MULLQ, but MULUDQ is still better if the
// the high bits are known to be zero.
if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
return Op;

SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);		SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

// Only multiply lo/hi halves that aren't known to be zero.		// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;		SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)		if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);		AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

SDValue AloBhi = Zero;		SDValue AloBhi = Zero;
▲ Show 20 Lines • Show All 10,528 Lines • ▼ Show 20 Lines	auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);		MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);		return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
};		};
return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,		return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
DAG.getBitcast(WVT, N0),		DAG.getBitcast(WVT, N0),
DAG.getBitcast(WVT, N1), PMADDWDBuilder);		DAG.getBitcast(WVT, N1), PMADDWDBuilder);
}		}

		static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
		const X86Subtarget &Subtarget) {
		if (!Subtarget.hasSSE2())
		return SDValue();

		EVT VT = N->getValueType(0);

		// Only support vXi64 vectors.
		if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
		!DAG.getTargetLoweringInfo().isTypeLegal(VT))
		return SDValue();

		SDValue N0 = N->getOperand(0);
		SDValue N1 = N->getOperand(1);

		// MULDQ returns the 64-bit result of the signed multiplication of the lower
		// 32-bits. We can lower with this if the sign bits stretch that far.
		if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
		DAG.ComputeNumSignBits(N1) > 32) {
		auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
		ArrayRef<SDValue> Ops) {
		return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
		};
		return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
		N0, N1, PMULDQBuilder, /*CheckBWI*/false);
		}

		// If the upper bits are zero we can use a single pmuludq.
		APInt Mask = APInt::getHighBitsSet(64, 32);
		if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
		auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
		[Inline review comment by RKSimon on the lambda name above; status: not done] PMULUDQBuilder ?
		ArrayRef<SDValue> Ops) {
		return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
		};
		return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
		N0, N1, PMULDQBuilder, /*CheckBWI*/false);
		}

		return SDValue();
		}

/// Optimize a single multiply with constant into two operations in order to		/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.		/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,		static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);

if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))		if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
return V;		return V;

		if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
		return V;

if (DCI.isBeforeLegalize() && VT.isVector())		if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);		return reduceVMULWidth(N, DAG, Subtarget);

if (!MulConstantOptimization)		if (!MulConstantOptimization)
return SDValue();		return SDValue();
// An imul is usually smaller than the alternative sequence.		// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().optForMinSize())		if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 2,586 Lines • ▼ Show 20 Lines	if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
return TruncateArithmetic(Op0, Op1);		return TruncateArithmetic(Op0, Op1);
break;		break;
}		}

case ISD::MUL:		case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its		// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
// better to truncate if we have the chance.		// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&		if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
!Subtarget.hasDQI())		!TLI.isOperationLegal(Opcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));		return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
case ISD::ADD: {		case ISD::ADD: {
// TODO: ISD::SUB should be here but interferes with combineSubToSubus.		// TODO: ISD::SUB should be here but interferes with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);		SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);		SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&		if (TLI.isOperationLegal(Opcode, VT) &&
IsRepeatedOpOrFreeTruncation(Op0, Op1))		IsRepeatedOpOrFreeTruncation(Op0, Op1))
▲ Show 20 Lines • Show All 3,927 Lines • Show Last 20 Lines

test/CodeGen/X86/mulvi32.ll

	Show First 20 Lines • Show All 161 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: pmuludq %xmm4, %xmm2			; SSE2-NEXT: pmuludq %xmm4, %xmm2
	; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]			; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
	; SSE2-NEXT: pmuludq %xmm1, %xmm0			; SSE2-NEXT: pmuludq %xmm1, %xmm0
	; SSE2-NEXT: movdqa %xmm2, %xmm1			; SSE2-NEXT: movdqa %xmm2, %xmm1
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE42-LABEL: _mul4xi32toi64a:			; SSE42-LABEL: _mul4xi32toi64a:
	; SSE42: # %bb.0:			; SSE42: # %bb.0:
	; SSE42-NEXT: pxor %xmm3, %xmm3			; SSE42-NEXT: pxor %xmm3, %xmm3
				[Inline review comment by craig.topper (author); status: not done] These shuffles are moving the high elements down so we can zero extend. The original code used a punpck with zero instead.
	; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero			; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
	; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]			; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
	; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero			; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
	; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]			; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
	; SSE42-NEXT: pmuludq %xmm0, %xmm1			; SSE42-NEXT: pmuludq %xmm0, %xmm1
	; SSE42-NEXT: pmuludq %xmm4, %xmm2			; SSE42-NEXT: pmuludq %xmm4, %xmm2
	; SSE42-NEXT: movdqa %xmm2, %xmm0			; SSE42-NEXT: movdqa %xmm2, %xmm0
	; SSE42-NEXT: retq			; SSE42-NEXT: retq
	;			;
	; AVX1-LABEL: _mul4xi32toi64a:			; AVX1-LABEL: _mul4xi32toi64a:
	; AVX1: # %bb.0:			; AVX1: # %bb.0:
	; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2			; AVX1-NEXT: vmovd %xmm0, %r8d
				[Inline review comment by craig.topper (author); status: not done] I think this is simplify demanded bits on pmuldq kicking in to remove the zeros going into elements 1 and 3. So they are effectively garbage.
	; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]			; AVX1-NEXT: vpextrd $1, %xmm0, %r9d
	; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]			; AVX1-NEXT: vpextrd $2, %xmm0, %edx
	; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2			; AVX1-NEXT: vpextrd $3, %xmm0, %esi
	; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero			; AVX1-NEXT: vmovd %xmm1, %r10d
	; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero			; AVX1-NEXT: vpextrd $1, %xmm1, %eax
	; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0			; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
	; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0			; AVX1-NEXT: vpextrd $3, %xmm1, %edi
				; AVX1-NEXT: vmovq %rdi, %xmm0
				; AVX1-NEXT: vmovq %rcx, %xmm1
				; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
				; AVX1-NEXT: vmovq %rsi, %xmm1
				; AVX1-NEXT: vmovq %rdx, %xmm2
				; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
				; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
				; AVX1-NEXT: vmovq %rax, %xmm1
				; AVX1-NEXT: vmovq %r10, %xmm2
				; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
				; AVX1-NEXT: vmovq %r9, %xmm2
				; AVX1-NEXT: vmovq %r8, %xmm3
				; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
				; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
				; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: _mul4xi32toi64a:			; AVX2-LABEL: _mul4xi32toi64a:
	; AVX2: # %bb.0:			; AVX2: # %bb.0:
	; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2			; AVX2-NEXT: vmovd %xmm0, %r10d
	; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]			; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
	; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero			; AVX2-NEXT: vpextrd $2, %xmm0, %edx
	; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0			; AVX2-NEXT: vpextrd $3, %xmm0, %esi
	; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]			; AVX2-NEXT: vmovd %xmm1, %r8d
	; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero			; AVX2-NEXT: vpextrd $1, %xmm1, %r9d
	; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1			; AVX2-NEXT: vpextrd $2, %xmm1, %edi
				; AVX2-NEXT: vpextrd $3, %xmm1, %eax
				; AVX2-NEXT: vmovq %rsi, %xmm0
				; AVX2-NEXT: vmovq %rdx, %xmm1
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
				; AVX2-NEXT: vmovq %rcx, %xmm1
				; AVX2-NEXT: vmovq %r10, %xmm2
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
				; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
				; AVX2-NEXT: vmovq %rax, %xmm1
				; AVX2-NEXT: vmovq %rdi, %xmm2
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
				; AVX2-NEXT: vmovq %r9, %xmm2
				; AVX2-NEXT: vmovq %r8, %xmm3
				; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
				; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
	; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0			; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%f00 = extractelement <4 x i32> %0, i32 0			%f00 = extractelement <4 x i32> %0, i32 0
	%f01 = extractelement <4 x i32> %0, i32 1			%f01 = extractelement <4 x i32> %0, i32 1
	%f02 = extractelement <4 x i32> %0, i32 2			%f02 = extractelement <4 x i32> %0, i32 2
	%f03 = extractelement <4 x i32> %0, i32 3			%f03 = extractelement <4 x i32> %0, i32 3
	%f10 = extractelement <4 x i32> %1, i32 0			%f10 = extractelement <4 x i32> %1, i32 0
	%f11 = extractelement <4 x i32> %1, i32 1			%f11 = extractelement <4 x i32> %1, i32 1
	▲ Show 20 Lines • Show All 206 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Combine vXi64 multiplies to MULDQ/MULUDQ during DAG combine instead of lowering.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 138112

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/mulvi32.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Combine vXi64 multiplies to MULDQ/MULUDQ during DAG combine instead of lowering.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 138112

lib/Target/X86/X86ISelLowering.cpp

test/CodeGen/X86/mulvi32.ll

[X86] Combine vXi64 multiplies to MULDQ/MULUDQ during DAG combine instead of lowering.
ClosedPublic