Diff 9582

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,406 Lines • ▼ Show 20 Lines	SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {

assert(VT != MVT::v4i32 && "unsupported shuffle type");		assert(VT != MVT::v4i32 && "unsupported shuffle type");

// Invert the operand order and use SHUFPS to match it.		// Invert the operand order and use SHUFPS to match it.
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,		return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
getShuffleSHUFImmediate(SVOp), DAG);		getShuffleSHUFImmediate(SVOp), DAG);
}		}

		/// Build a scalar load of element \p Index from the memory read by the vector
		/// load \p Load.
		///
		/// The new load uses the same chain as \p Load and reads
		/// `element store size` bytes from `base pointer + Index * element store size`.
		/// The original load node is not modified or replaced here; the caller is
		/// responsible for using the returned value in its place.
		///
		/// \param Load  Vector load whose memory is being narrowed.
		/// \param Index Element number (0-based) to load.
		/// \param DAG   SelectionDAG in which the new nodes are created.
		/// \returns The SDValue of the newly created scalar load.
		static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
		                                         SelectionDAG &DAG) {
		  SDLoc dl(Load);
		  MVT VT = Load->getSimpleValueType(0);
		  MVT EVT = VT.getVectorElementType();
		  assert(Index < VT.getVectorNumElements() &&
		         "element index out of range for narrowed vector load");

		  // Byte distance of the requested element from the start of the vector.
		  const unsigned EltBytes = EVT.getStoreSize();
		  const unsigned ByteOffset = Index * EltBytes;

		  // Operand 1 of the load is the address it reads from.
		  SDValue Addr = Load->getOperand(1);
		  MVT PtrVT = Addr.getSimpleValueType();
		  SDValue NewAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Addr,
		                                DAG.getConstant(ByteOffset, PtrVT));

		  // Derive the memory operand with the same byte offset that was applied to
		  // the address, so that the recorded location and alignment describe the
		  // bytes actually read rather than the first element of the vector.
		  SDValue NewLoad =
		      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
		                  DAG.getMachineFunction().getMachineMemOperand(
		                      Load->getMemOperand(), ByteOffset, EltBytes));
		  return NewLoad;
		}

// It is only safe to call this function if isINSERTPSMask is true for		// It is only safe to call this function if isINSERTPSMask is true for
// this shufflevector mask.		// this shufflevector mask.
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,		static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
// Generate an insertps instruction when inserting an f32 from memory onto a		// Generate an insertps instruction when inserting an f32 from memory onto a
// v4f32 or when copying a member from one v4f32 to another.		// v4f32 or when copying a member from one v4f32 to another.
// We also use it for transferring i32 from one register to another,		// We also use it for transferring i32 from one register to another,
// since it simply copies the same bits.		// since it simply copies the same bits.
// If we're transferring an i32 from memory to a specific element in a		// If we're transferring an i32 from memory to a specific element in a
// register, we output a generic DAG that will match the PINSRD		// register, we output a generic DAG that will match the PINSRD
// instruction.		// instruction.
// TODO: Optimize for AVX cases too (VINSERTPS)
MVT VT = SVOp->getSimpleValueType(0);		MVT VT = SVOp->getSimpleValueType(0);
MVT EVT = VT.getVectorElementType();		MVT EVT = VT.getVectorElementType();
SDValue V1 = SVOp->getOperand(0);		SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);		SDValue V2 = SVOp->getOperand(1);
auto Mask = SVOp->getMask();		auto Mask = SVOp->getMask();
assert((VT == MVT::v4f32 \|\| VT == MVT::v4i32) &&		assert((VT == MVT::v4f32 \|\| VT == MVT::v4i32) &&
"unsupported vector type for insertps/pinsrd");		"unsupported vector type for insertps/pinsrd");

Show All 16 Lines	DestIndex = std::find_if(Mask.begin(), Mask.end(),
[](const int &i) { return i >= 4; }) -		[](const int &i) { return i >= 4; }) -
Mask.begin();		Mask.begin();
}		}

if (MayFoldLoad(From)) {		if (MayFoldLoad(From)) {
// Trivial case, when From comes from a load and is only used by the		// Trivial case, when From comes from a load and is only used by the
// shuffle. Make it use insertps from the vector that we need from that		// shuffle. Make it use insertps from the vector that we need from that
// load.		// load.
SDValue Addr = From.getOperand(1);
SDValue NewAddr =
DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
DAG.getConstant(DestIndex * EVT.getStoreSize(),
Addr.getSimpleValueType()));

LoadSDNode *Load = cast<LoadSDNode>(From);
SDValue NewLoad =		SDValue NewLoad =
DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,		NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
DAG.getMachineFunction().getMachineMemOperand(		if (!NewLoad.getNode())
Load->getMemOperand(), 0, EVT.getStoreSize()));		return SDValue();

if (EVT == MVT::f32) {		if (EVT == MVT::f32) {
// Create this as a scalar to vector to match the instruction pattern.		// Create this as a scalar to vector to match the instruction pattern.
SDValue LoadScalarToVector =		SDValue LoadScalarToVector =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);		DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);		SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,		return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
InsertpsMask);		InsertpsMask);
▲ Show 20 Lines • Show All 12,798 Lines • ▼ Show 20 Lines	if (IsSEXT1 && IsVZero0) {
return DAG.getNOT(DL, RHS.getOperand(0), VT);		return DAG.getNOT(DL, RHS.getOperand(0), VT);
return RHS.getOperand(0);		return RHS.getOperand(0);
}		}
}		}

return SDValue();		return SDValue();
}		}

		/// DAG combine for X86ISD::INSERTPS nodes whose second operand is a vector
		/// load that MayFoldLoad accepts.
		///
		/// The vector load is narrowed to a scalar load of the element selected by
		/// the countS field of the immediate (operand 2), wrapped in a
		/// SCALAR_TO_VECTOR, and a new INSERTPS is built from it so that the load can
		/// be matched as the instruction's memory operand.
		///
		/// \param N         The X86ISD::INSERTPS node being combined.
		/// \param DAG       SelectionDAG in which the replacement nodes are created.
		/// \param Subtarget Unused by this combine.
		/// \returns The replacement INSERTPS node, or an empty SDValue if the second
		///          operand is not a foldable load.
		static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
		                                      const X86Subtarget *Subtarget) {
		  SDLoc dl(N);
		  MVT VT = N->getOperand(1)->getSimpleValueType(0);
		  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
		         "X86insertps is only defined for v4x32");

		  SDValue Ld = N->getOperand(1);
		  if (!MayFoldLoad(Ld))
		    return SDValue();

		  // Extract the countS bits (the top two bits of the 8-bit immediate) so we
		  // can get the proper address when narrowing the vector load to a specific
		  // element. When the second source op is a memory address, insertps doesn't
		  // use countS and just gets an f32 from that address.
		  // The value is masked to two bits so that a malformed immediate can never
		  // select an element outside the 4-element vector.
		  unsigned SrcIndex =
		      (cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6) & 0x3;
		  Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), SrcIndex, DAG);

		  // Create this as a scalar to vector to match the instruction pattern.
		  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
		  // countS bits are ignored when loading from memory on insertps, which
		  // means we don't need to explicitly set them to 0.
		  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
		                     LoadScalarToVector, N->getOperand(2));
		}

// Helper function of PerformSETCCCombine. It is to materialize "setb reg"		// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces		// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.		// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,		static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
MVT VT) {		MVT VT) {
if (VT == MVT::i8)		if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT,		return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,		DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
▲ Show 20 Lines • Show All 287 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MOVSS:		case X86ISD::MOVSS:
case X86ISD::MOVSD:		case X86ISD::MOVSD:
case X86ISD::VPERMILP:		case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:		case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);		case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);		case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:		case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);		return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
		case X86ISD::INSERTPS:
		return PerformINSERTPSCombine(N, DAG, Subtarget);
}		}

return SDValue();		return SDValue();
}		}

/// isTypeDesirableForOp - Return true if the target has native support for		/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the		/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16		/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
▲ Show 20 Lines • Show All 775 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,544 Lines • ▼ Show 20 Lines

	let ExeDomain = SSEPackedSingle in {			let ExeDomain = SSEPackedSingle in {
	let Predicates = [UseAVX] in			let Predicates = [UseAVX] in
	defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;			defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
	let Constraints = "$src1 = $dst" in			let Constraints = "$src1 = $dst" in
	defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;			defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
	}			}

				// Extra selection patterns for the non-VEX INSERTPS memory form; only
				// active when the UseSSE41 predicate holds.
				let Predicates = [UseSSE41] in {
				// If we're inserting an element from a load or a null pshuf of a load,
				// fold the load into the insertps instruction.
				// Case 1: the inserted operand is X86PShufd with immediate 0 applied to a
				// scalar f32 load that was placed in a vector via scalar_to_vector.
				def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
				(scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
				imm:$src3)),
				(INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				// Case 2: the inserted operand is X86PShufd with immediate 0 applied
				// directly to a full v4f32 load. The insertps immediate ($src3) is passed
				// through unchanged in both cases.
				def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
				(loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
				(INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				}

				// Extra selection patterns for the VEX-encoded VINSERTPS memory form; only
				// active when the UseAVX predicate holds.
				let Predicates = [UseAVX] in {
				// If we're inserting an element from a vbroadcast of a load, fold the
				// load into the X86insertps instruction.
				// Case 1: the broadcast source is a scalar f32 load.
				def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
				(X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
				(VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				// Case 2: the broadcast source is a full v4f32 load. The insertps
				// immediate ($src3) is passed through unchanged in both cases.
				def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
				(X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
				(VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// SSE4.1 - Round Instructions			// SSE4.1 - Round Instructions
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,			multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
	X86MemOperand x86memop, RegisterClass RC,			X86MemOperand x86memop, RegisterClass RC,
	PatFrag mem_frag32, PatFrag mem_frag64,			PatFrag mem_frag32, PatFrag mem_frag64,
	Intrinsic V4F32Int, Intrinsic V2F64Int> {			Intrinsic V4F32Int, Intrinsic V2F64Int> {
	▲ Show 20 Lines • Show All 2,298 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx.ll

	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx \| FileCheck %s			; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx \| FileCheck %s -check-prefix=X32 --check-prefix=CHECK
				; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx \| FileCheck %s -check-prefix=X64 --check-prefix=CHECK

	define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {			define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
	; CHECK-LABEL: @blendvb_fallback_v4i32			; CHECK-LABEL: @blendvb_fallback_v4i32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y			%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
	ret <4 x i32> %ret			ret <4 x i32> %ret
	}			}

	define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {			define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
	; CHECK-LABEL: @blendvb_fallback_v8i32			; CHECK-LABEL: @blendvb_fallback_v8i32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y			%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
	ret <8 x i32> %ret			ret <8 x i32> %ret
	}			}

	define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {			define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
	; CHECK-LABEL: @blendvb_fallback_v8f32			; CHECK-LABEL: @blendvb_fallback_v8f32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y			%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
	ret <8 x float> %ret			ret <8 x float> %ret
	}			}

				declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

				define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
				ret <4 x float> %2
				}

				;; Use a non-zero CountS for insertps
				define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load_offset:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $96, 4(%{{...}}), %
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
				ret <4 x float> %2
				}

				define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
				; CHECK-LABEL: insertps_from_vector_load_offset_2:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; X32: movl 8(%esp), %ecx
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
				%2 = load <4 x float>* %1, align 16
				%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
				ret <4 x float> %3
				}

				define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_loadf32:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
				; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
				; On X32, account for the arguments' move to registers
				; X32: movl 4(%esp), %{{...}}
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %b, align 4
				%2 = extractelement <4 x float> %1, i32 0
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
				define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_multiple_use:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK: vbroadcastss
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: vaddps
				; CHECK: vaddps
				; CHECK: vaddps
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
				%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
				%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
				%11 = fadd <4 x float> %7, %8
				%12 = fadd <4 x float> %9, %10
				%13 = fadd <4 x float> %11, %12
				ret <4 x float> %13
				}

llvm/trunk/test/CodeGen/X86/fold-load-vec.ll

	; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 \| FileCheck %s			; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 \| FileCheck %s

	; rdar://12721174			; rdar://12721174
	; We should not fold movss into pshufd since pshufd expects m128 while movss			; We should not fold movss into pshufd since pshufd expects m128 while movss
	; loads from m32.			; loads from m32.
	define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {			define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
	; CHECK: sample_test			; CHECK: sample_test
	; CHECK: movaps			; CHECK-NOT: movaps
	; CHECK: insertps			; CHECK: insertps
	entry:			entry:
	%source.addr = alloca <4 x float>*, align 8			%source.addr = alloca <4 x float>*, align 8
	%dest.addr = alloca <2 x float>*, align 8			%dest.addr = alloca <2 x float>*, align 8
	%tmp = alloca <2 x float>, align 8			%tmp = alloca <2 x float>, align 8
	store <4 x float>* %source, <4 x float>** %source.addr, align 8			store <4 x float>* %source, <4 x float>** %source.addr, align 8
	store <2 x float>* %dest, <2 x float>** %dest.addr, align 8			store <2 x float>* %dest, <2 x float>** %dest.addr, align 8
	store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8			store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8
	Show All 23 Lines

llvm/trunk/test/CodeGen/X86/sse41.ll

	Show First 20 Lines • Show All 578 Lines • ▼ Show 20 Lines

	define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {			define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
	; CHECK-LABEL: blendvb_fallback			; CHECK-LABEL: blendvb_fallback
	; CHECK: blendvb			; CHECK: blendvb
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y			%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
	ret <8 x i16> %ret			ret <8 x i16> %ret
	}			}

				define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
				ret <4 x float> %2
				}

				;; Use a non-zero CountS for insertps
				define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load_offset:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $96, 4(%{{...}}), %
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
				ret <4 x float> %2
				}

				define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
				; CHECK-LABEL: insertps_from_vector_load_offset_2:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; X32: movl 8(%esp), %ecx
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
				%2 = load <4 x float>* %1, align 16
				%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
				ret <4 x float> %3
				}

				define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_loadf32:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
				; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
				; On X32, account for the arguments' move to registers
				; X32: movl 4(%esp), %{{...}}
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %b, align 4
				%2 = extractelement <4 x float> %1, i32 0
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
				define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_multiple_use:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK: movss
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: addps
				; CHECK: addps
				; CHECK: addps
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
				%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
				%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
				%11 = fadd <4 x float> %7, %8
				%12 = fadd <4 x float> %9, %10
				%13 = fadd <4 x float> %11, %12
				ret <4 x float> %13
				}

This is an archive of the discontinued LLVM Phabricator instance.

Added more insertps optimizations
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 9582

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/test/CodeGen/X86/avx.ll

llvm/trunk/test/CodeGen/X86/fold-load-vec.ll

llvm/trunk/test/CodeGen/X86/sse41.ll

This is an archive of the discontinued LLVM Phabricator instance.

Added more insertps optimizations
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 9582

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

llvm/trunk/lib/Target/X86/X86InstrSSE.td

llvm/trunk/test/CodeGen/X86/avx.ll

llvm/trunk/test/CodeGen/X86/fold-load-vec.ll

llvm/trunk/test/CodeGen/X86/sse41.ll

Added more insertps optimizations
ClosedPublic