Diff 9503

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,406 Lines • ▼ Show 20 Lines	SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {

assert(VT != MVT::v4i32 && "unsupported shuffle type");		assert(VT != MVT::v4i32 && "unsupported shuffle type");

// Invert the operand order and use SHUFPS to match it.		// Invert the operand order and use SHUFPS to match it.
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,		return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
getShuffleSHUFImmediate(SVOp), DAG);		getShuffleSHUFImmediate(SVOp), DAG);
}		}

		/// Build a scalar load of element \p Index from the memory read by the
		/// vector load \p Load.
		///
		/// The new load takes \p Load's input chain and reads at
		/// `Index * <element store size>` bytes past \p Load's address operand.
		/// \p Load itself is neither modified nor replaced here.
		///
		/// \returns the new element-typed load.
		static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
		                                         SelectionDAG &DAG) {
		  SDLoc dl(Load);
		  MVT VT = Load->getSimpleValueType(0);
		  MVT EVT = VT.getVectorElementType();
		  SDValue Addr = Load->getOperand(1);
		  MVT PtrVT = Addr.getSimpleValueType();
		  // Byte offset of the requested element from the start of the vector.
		  unsigned Offset = Index * EVT.getStoreSize();
		  SDValue NewAddr =
		      DAG.getNode(ISD::ADD, dl, PtrVT, Addr, DAG.getConstant(Offset, PtrVT));

		  // The narrowed memory operand has to describe the bytes that are actually
		  // read, so it is shifted by the same offset as the address. A zero offset
		  // would describe element 0 no matter which Index was requested.
		  return DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
		                     DAG.getMachineFunction().getMachineMemOperand(
		                         Load->getMemOperand(), Offset, EVT.getStoreSize()));
		}

// It is only safe to call this function if isINSERTPSMask is true for		// It is only safe to call this function if isINSERTPSMask is true for
// this shufflevector mask.		// this shufflevector mask.
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,		static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
// Generate an insertps instruction when inserting an f32 from memory onto a		// Generate an insertps instruction when inserting an f32 from memory onto a
// v4f32 or when copying a member from one v4f32 to another.		// v4f32 or when copying a member from one v4f32 to another.
// We also use it for transferring i32 from one register to another,		// We also use it for transferring i32 from one register to another,
// since it simply copies the same bits.		// since it simply copies the same bits.
// If we're transferring an i32 from memory to a specific element in a		// If we're transferring an i32 from memory to a specific element in a
// register, we output a generic DAG that will match the PINSRD		// register, we output a generic DAG that will match the PINSRD
// instruction.		// instruction.
// TODO: Optimize for AVX cases too (VINSERTPS)
MVT VT = SVOp->getSimpleValueType(0);		MVT VT = SVOp->getSimpleValueType(0);
MVT EVT = VT.getVectorElementType();		MVT EVT = VT.getVectorElementType();
SDValue V1 = SVOp->getOperand(0);		SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);		SDValue V2 = SVOp->getOperand(1);
auto Mask = SVOp->getMask();		auto Mask = SVOp->getMask();
assert((VT == MVT::v4f32 \|\| VT == MVT::v4i32) &&		assert((VT == MVT::v4f32 \|\| VT == MVT::v4i32) &&
"unsupported vector type for insertps/pinsrd");		"unsupported vector type for insertps/pinsrd");

Show All 16 Lines	DestIndex = std::find_if(Mask.begin(), Mask.end(),
[](const int &i) { return i >= 4; }) -		[](const int &i) { return i >= 4; }) -
Mask.begin();		Mask.begin();
}		}

if (MayFoldLoad(From)) {		if (MayFoldLoad(From)) {
// Trivial case, when From comes from a load and is only used by the		// Trivial case, when From comes from a load and is only used by the
// shuffle. Make it use insertps from the vector that we need from that		// shuffle. Make it use insertps from the vector that we need from that
// load.		// load.
SDValue Addr = From.getOperand(1);
SDValue NewAddr =
DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
DAG.getConstant(DestIndex * EVT.getStoreSize(),
Addr.getSimpleValueType()));

LoadSDNode *Load = cast<LoadSDNode>(From);
SDValue NewLoad =		SDValue NewLoad =
DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,		NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
DAG.getMachineFunction().getMachineMemOperand(		if (!NewLoad.getNode())
Load->getMemOperand(), 0, EVT.getStoreSize()));		return SDValue();

if (EVT == MVT::f32) {		if (EVT == MVT::f32) {
// Create this as a scalar to vector to match the instruction pattern.		// Create this as a scalar to vector to match the instruction pattern.
SDValue LoadScalarToVector =		SDValue LoadScalarToVector =
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);		DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);		SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,		return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
InsertpsMask);		InsertpsMask);
▲ Show 20 Lines • Show All 12,783 Lines • ▼ Show 20 Lines	if (IsSEXT1 && IsVZero0) {
return DAG.getNOT(DL, RHS.getOperand(0), VT);		return DAG.getNOT(DL, RHS.getOperand(0), VT);
return RHS.getOperand(0);		return RHS.getOperand(0);
}		}
}		}

return SDValue();		return SDValue();
}		}

		/// DAG combine for X86ISD::INSERTPS. When the inserted vector (operand 1) is
		/// a load that may be folded, replace it with a scalar load of the selected
		/// element wrapped in SCALAR_TO_VECTOR, so the load can be matched as the
		/// memory form of the instruction.
		///
		/// \returns the replacement INSERTPS node, or an empty SDValue when operand 1
		/// is not a foldable load.
		static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
		                                      const X86Subtarget *Subtarget) {
		  SDLoc dl(N);
		  MVT VT = N->getOperand(1)->getSimpleValueType(0);
		  // The type test is parenthesized on purpose: '&&' binds tighter than '||',
		  // so without the parentheses the message is and-ed only with the second
		  // comparison.
		  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
		         "X86insertps is only defined for v4x32");

		  SDValue Ld = N->getOperand(1);
		  if (!MayFoldLoad(Ld))
		    return SDValue();

		  // When the source is a memory operand, the Count_S bits of the immediate
		  // are not used to select the element from the source memory location.
		  // That is why the Count_S bits are extracted from the immediate here
		  // (the shift by 6) and used as the element index of the narrowed load.
		  unsigned SrcIndex =
		      cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
		  Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), SrcIndex, DAG);

		  // Create this as a scalar to vector to match the instruction pattern.
		  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
		  // countS bits are ignored when loading from memory on insertps, which
		  // means we don't need to explicitly set them to 0.
		  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
		                     LoadScalarToVector, N->getOperand(2));
		}

// Helper function of PerformSETCCCombine. It is to materialize "setb reg"		// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
// as "sbb reg,reg", since it can be extended without zext and produces		// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.		// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,		static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
MVT VT) {		MVT VT) {
if (VT == MVT::i8)		if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT,		return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,		DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
▲ Show 20 Lines • Show All 287 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MOVSS:		case X86ISD::MOVSS:
case X86ISD::MOVSD:		case X86ISD::MOVSD:
case X86ISD::VPERMILP:		case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:		case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);		case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);		case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:		case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);		return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
		case X86ISD::INSERTPS:
		return PerformINSERTPSCombine(N, DAG, Subtarget);
}		}

return SDValue();		return SDValue();
}		}

/// isTypeDesirableForOp - Return true if the target has native support for		/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the		/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16		/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
▲ Show 20 Lines • Show All 775 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,544 Lines • ▼ Show 20 Lines

	let ExeDomain = SSEPackedSingle in {			let ExeDomain = SSEPackedSingle in {
	let Predicates = [UseAVX] in			let Predicates = [UseAVX] in
	defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;			defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
	let Constraints = "$src1 = $dst" in			let Constraints = "$src1 = $dst" in
	defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;			defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
	}			}

				// The braces make the predicate cover both patterns below; without
				// them `let ... in` binds only to the first def that follows.
				let Predicates = [UseSSE41] in {
				  // If we're inserting an element from a load or a null pshuf of a load,
				  // fold the load into the insertps instruction.
				  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
				                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
				                   imm:$src3)),
				            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
				                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
				            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				}

				// The braces make the predicate cover both patterns below; without
				// them `let ... in` binds only to the first def that follows.
				let Predicates = [UseAVX] in {
				  // If we're inserting an element from a vbroadcast of a load, fold the
				  // load into the X86insertps instruction.
				  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
				                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
				            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
				                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
				            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
				}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// SSE4.1 - Round Instructions			// SSE4.1 - Round Instructions
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,			multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
	X86MemOperand x86memop, RegisterClass RC,			X86MemOperand x86memop, RegisterClass RC,
	PatFrag mem_frag32, PatFrag mem_frag64,			PatFrag mem_frag32, PatFrag mem_frag64,
	Intrinsic V4F32Int, Intrinsic V2F64Int> {			Intrinsic V4F32Int, Intrinsic V2F64Int> {
	▲ Show 20 Lines • Show All 2,298 Lines • Show Last 20 Lines

test/CodeGen/X86/avx2.ll

	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 \| FileCheck %s			; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 \| FileCheck %s -check-prefix=X32 --check-prefix=CHECK
				; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 \| FileCheck %s -check-prefix=X64 --check-prefix=CHECK

				declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

	define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {			define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
	; CHECK-LABEL: @blendvb_fallback_v4i32			; CHECK-LABEL: @blendvb_fallback_v4i32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y			%ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
	ret <4 x i32> %ret			ret <4 x i32> %ret
	}			}

	define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {			define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
	; CHECK-LABEL: @blendvb_fallback_v8i32			; CHECK-LABEL: @blendvb_fallback_v8i32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y			%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
	ret <8 x i32> %ret			ret <8 x i32> %ret
	}			}

	define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {			define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
	; CHECK-LABEL: @blendvb_fallback_v8f32			; CHECK-LABEL: @blendvb_fallback_v8f32
	; CHECK: vblendvps			; CHECK: vblendvps
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y			%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
	ret <8 x float> %ret			ret <8 x float> %ret
	}			}

				;; A whole-vector load is passed straight to the insertps intrinsic with
				;; immediate 48 (48 >> 6 == 0, so source element 0). The checks below allow
				;; no separate mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
				ret <4 x float> %2
				}

				;; Use a non-zero CountS for insertps
				;; Immediate 96 gives 96 >> 6 == 1, i.e. source element 1, so the expected
				;; memory operand is 4 bytes past the vector's address.
				define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load_offset:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $96, 4(%{{...}}), %
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
				ret <4 x float> %2
				}

				;; Same as the offset test, but the vector is addressed through an indexed
				;; getelementptr and the immediate is 192 (192 >> 6 == 3, source element 3),
				;; so the expected memory operand is base+index with displacement 12.
				define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
				; CHECK-LABEL: insertps_from_vector_load_offset_2:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; X32: movl 8(%esp), %ecx
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
				%2 = load <4 x float>* %1, align 16
				%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
				ret <4 x float> %3
				}

				;; A scalar float load is splatted into all four lanes with insertelement
				;; and the splat is the insertps source (immediate 48). The checks below
				;; allow no mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_loadf32:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; Element 0 of a loaded vector is splatted into all four lanes and the
				;; splat is the insertps source (immediate 48). The checks below allow no
				;; mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
				; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
				; On X32, account for the arguments' move to registers
				; X32: movl 4(%esp), %{{...}}
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %b, align 4
				%2 = extractelement <4 x float> %1, i32 0
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
				;; The splat of one scalar load is used by four insertps calls whose
				;; results are summed. The checks below expect a single vbroadcastss,
				;; then four insertps and three vaddps before the return.
				define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_multiple_use:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK: vbroadcastss
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: vaddps
				; CHECK: vaddps
				; CHECK: vaddps
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
				%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
				%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
				%11 = fadd <4 x float> %7, %8
				%12 = fadd <4 x float> %9, %10
				%13 = fadd <4 x float> %11, %12
				ret <4 x float> %13
				}

test/CodeGen/X86/fold-load-vec.ll

	; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 \| FileCheck %s			; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 \| FileCheck %s

	; rdar://12721174			; rdar://12721174
	; We should not fold movss into pshufd since pshufd expects m128 while movss			; We should not fold movss into pshufd since pshufd expects m128 while movss
	; loads from m32.			; loads from m32.
	define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {			define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
	; CHECK: sample_test			; CHECK: sample_test
	; CHECK: movaps			; CHECK-NOT: movaps
	; CHECK: insertps			; CHECK: insertps
	entry:			entry:
	%source.addr = alloca <4 x float>*, align 8			%source.addr = alloca <4 x float>*, align 8
	%dest.addr = alloca <2 x float>*, align 8			%dest.addr = alloca <2 x float>*, align 8
	%tmp = alloca <2 x float>, align 8			%tmp = alloca <2 x float>, align 8
	store <4 x float>* %source, <4 x float>** %source.addr, align 8			store <4 x float>* %source, <4 x float>** %source.addr, align 8
	store <2 x float>* %dest, <2 x float>** %dest.addr, align 8			store <2 x float>* %dest, <2 x float>** %dest.addr, align 8
	store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8			store <2 x float> zeroinitializer, <2 x float>* %tmp, align 8
	Show All 23 Lines

test/CodeGen/X86/sse41.ll

	Show First 20 Lines • Show All 578 Lines • ▼ Show 20 Lines

	define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {			define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
	; CHECK-LABEL: blendvb_fallback			; CHECK-LABEL: blendvb_fallback
	; CHECK: blendvb			; CHECK: blendvb
	; CHECK: ret			; CHECK: ret
	%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y			%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
	ret <8 x i16> %ret			ret <8 x i16> %ret
	}			}

				;; A whole-vector load is passed straight to the insertps intrinsic with
				;; immediate 48 (48 >> 6 == 0, so source element 0). The checks below allow
				;; no separate mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
				ret <4 x float> %2
				}

				;; Use a non-zero CountS for insertps
				;; Immediate 96 gives 96 >> 6 == 1, i.e. source element 1, so the expected
				;; memory operand is 4 bytes past the vector's address.
				define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
				; CHECK-LABEL: insertps_from_vector_load_offset:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $96, 4(%{{...}}), %
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %pb, align 16
				%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
				ret <4 x float> %2
				}

				;; Same as the offset test, but the vector is addressed through an indexed
				;; getelementptr and the immediate is 192 (192 >> 6 == 3, source element 3),
				;; so the expected memory operand is base+index with displacement 12.
				define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
				; CHECK-LABEL: insertps_from_vector_load_offset_2:
				; On X32, account for the argument's move to registers
				; X32: movl 4(%esp), %eax
				; X32: movl 8(%esp), %ecx
				; CHECK-NOT: mov
				;; Try to match a bit more of the instr, since we need the load's offset.
				; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds <4 x float>* %pb, i64 %index
				%2 = load <4 x float>* %1, align 16
				%3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
				ret <4 x float> %3
				}

				;; A scalar float load is splatted into all four lanes with insertelement
				;; and the splat is the insertps source (immediate 48). The checks below
				;; allow no mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_loadf32:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; Element 0 of a loaded vector is splatted into all four lanes and the
				;; splat is the insertps source (immediate 48). The checks below allow no
				;; mov between the argument setup and the insertps.
				define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
				; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
				; On X32, account for the arguments' move to registers
				; X32: movl 4(%esp), %{{...}}
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK-NEXT: ret
				%1 = load <4 x float>* %b, align 4
				%2 = extractelement <4 x float> %1, i32 0
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				ret <4 x float> %7
				}

				;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
				;; The splat of one scalar load is used by four insertps calls whose
				;; results are summed. The checks below expect a single movss, then four
				;; insertps and three addps before the return.
				define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
				; CHECK-LABEL: insertps_from_broadcast_multiple_use:
				; On X32, account for the arguments' move to registers
				; X32: movl 8(%esp), %eax
				; X32: movl 4(%esp), %ecx
				; CHECK: movss
				; CHECK-NOT: mov
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: insertps $48
				; CHECK: addps
				; CHECK: addps
				; CHECK: addps
				; CHECK-NEXT: ret
				%1 = getelementptr inbounds float* %fb, i64 %index
				%2 = load float* %1, align 4
				%3 = insertelement <4 x float> undef, float %2, i32 0
				%4 = insertelement <4 x float> %3, float %2, i32 1
				%5 = insertelement <4 x float> %4, float %2, i32 2
				%6 = insertelement <4 x float> %5, float %2, i32 3
				%7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
				%8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
				%9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
				%10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
				%11 = fadd <4 x float> %7, %8
				%12 = fadd <4 x float> %9, %10
				%13 = fadd <4 x float> %11, %12
				ret <4 x float> %13
				}

This is an archive of the discontinued LLVM Phabricator instance.

Added more insertps optimizations
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 9503

lib/Target/X86/X86ISelLowering.cpp

lib/Target/X86/X86InstrSSE.td

test/CodeGen/X86/avx2.ll

test/CodeGen/X86/fold-load-vec.ll

test/CodeGen/X86/sse41.ll

This is an archive of the discontinued LLVM Phabricator instance.

Added more insertps optimizations (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 9503

lib/Target/X86/X86ISelLowering.cpp

lib/Target/X86/X86InstrSSE.td

test/CodeGen/X86/avx2.ll

test/CodeGen/X86/fold-load-vec.ll

test/CodeGen/X86/sse41.ll

Added more insertps optimizations
ClosedPublic