This is an archive of the discontinued LLVM Phabricator instance.

merge consecutive 16-byte loads into one 32-byte load (PR22329)
ClosedPublic

Authored by spatel on Jan 30 2015, 12:00 PM.

Download Raw Diff

Details

Reviewers

RKSimon
delena
mkuper
hfinkel

Commits

rGb7d562878440: Merge consecutive 16-byte loads into one 32-byte load (PR22329)
rL228006: Merge consecutive 16-byte loads into one 32-byte load (PR22329)

Summary

This patch detects consecutive vector loads using the existing EltsFromConsecutiveLoads() logic. This fixes:
http://llvm.org/bugs/show_bug.cgi?id=22329

This patch effectively reverts the tablegen additions of D6492 (http://reviews.llvm.org/rL224344), which in hindsight were a horrible hack. :)

The test cases that were added with that patch are simply modified to load from varying offsets of a base pointer. These loads did not match the existing tablegen patterns.

A happy side effect of doing this optimization earlier is that we can now fold the load into a math op where possible; this is shown in some of the updated checks in the test file.

Diff Detail

Event Timeline

spatel updated this revision to Diff 19050. Jan 30 2015, 12:00 PM

spatel retitled this revision to "merge consecutive 16-byte loads into one 32-byte load (PR22329)".

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: delena, hfinkel, mkuper, RKSimon.

spatel added a subscriber: Unknown Object (MLST).

mkuper added inline comments. Feb 1 2015, 6:37 AM

lib/Target/X86/X86ISelLowering.cpp
6099	If I'm reading this correctly, before this change, if we got here, then the size of VT always matched the size of the found consecutive load (VT.getSizeInBits() == EltVT.getSizeInBits() * NumElems). With this change, I think that no longer holds. The size of the consecutive load we find is LdVT.getSizeInBits() * Elts.size(), but there's no guarantee that this is actually the size of VT. The responsibility for ensuring this condition holds has moved to the caller. I think we now need an additional check that the sizes indeed match.
13216	You probably also want to check that Idx is what you expect it to be.

spatel added inline comments. Feb 1 2015, 9:49 AM

lib/Target/X86/X86ISelLowering.cpp
6099	Previously, I think there was an implicit assumption that each of the Elts was a matching scalar load of VT.getVectorElementType(); this was safe assuming the Elts were extracted from a build_vector. But yes, I agree. I will add a check inside the loop to confirm that each element matches the fractional size that we're expecting, and I will add a check after the loop to confirm that the cumulative size of the element loads matches the total size of the vector type (VT).
13216	Agree again - this is too lax. I will add checks to make sure that both insert_subvector indices match what we need.

Added type checks for loads. Added insertion index checks for both subvector instructions.
Thanks, Michael!

LGTM

This revision is now accepted and ready to land. Feb 3 2015, 2:02 AM

Closed by commit rL228006: Merge consecutive 16-byte loads into one 32-byte load (PR22329) (authored by spatel). · Explain Why · Feb 3 2015, 10:56 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

23 lines

X86InstrSSE.td

43 lines

test/

CodeGen/

X86/

unaligned-32-byte-memops.ll

67 lines

Diff 19050

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,052 Lines • ▼ Show 20 Lines
/// Example: <load i32 a, load i32 a+4, undef, undef> -> zextload a		/// Example: <load i32 a, load i32 a+4, undef, undef> -> zextload a
///		///
/// FIXME: we'd also like to handle the case where the last elements are zero		/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.		/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.		/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,		static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,		SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {		bool isAfterLegalize) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();		unsigned NumElems = Elts.size();

LoadSDNode *LDBase = nullptr;		LoadSDNode *LDBase = nullptr;
unsigned LastLoadedElt = -1U;		unsigned LastLoadedElt = -1U;

// For each element in the initializer, see if we've found a load or an undef.		// For each element in the initializer, see if we've found a load or an undef.
// If we don't find an initial load element, or later load elements are		// If we don't find an initial load element, or later load elements are
// non-consecutive, bail out.		// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {		for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];		SDValue Elt = Elts[i];
		// Look through a bitcast.
		if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
		Elt = Elt.getOperand(0);
if (!Elt.getNode() \|\|		if (!Elt.getNode() \|\|
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))		(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();		return SDValue();
if (!LDBase) {		if (!LDBase) {
if (Elt.getNode()->getOpcode() == ISD::UNDEF)		if (Elt.getNode()->getOpcode() == ISD::UNDEF)
return SDValue();		return SDValue();
LDBase = cast<LoadSDNode>(Elt.getNode());		LDBase = cast<LoadSDNode>(Elt.getNode());
LastLoadedElt = i;		LastLoadedElt = i;
continue;		continue;
}		}
if (Elt.getOpcode() == ISD::UNDEF)		if (Elt.getOpcode() == ISD::UNDEF)
continue;		continue;

LoadSDNode *LD = cast<LoadSDNode>(Elt);		LoadSDNode *LD = cast<LoadSDNode>(Elt);
if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))		EVT LdVT = Elt.getValueType();
		if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();		return SDValue();
LastLoadedElt = i;		LastLoadedElt = i;
}		}

// If we have found an entire vector of loads and undefs, then return a large		// If we have found an entire vector of loads and undefs, then return a large
// load of the entire vector width starting at the base pointer. If we found		// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.		// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {		if (LastLoadedElt == NumElems - 1) {

if (isAfterLegalize &&		if (isAfterLegalize &&
		mkuperUnsubmitted Not Done Reply Inline Actions If I'm reading this correctly, before this change, if we got here, then the size of VT always matched the size of the found consecutive load (VT.getSizeInBits() == EltVt.getSizeInBits() * NumElems). With this change, I think that no longer holds. The size of the consecutive load we find is LdVT.getSizeInBits() * Elts.size(), but there's no guarantee that this is actually the size of VT. The responsibility for ensuring this condition holds has moved to the caller. I think we now need an additional check that the sizes indeed match. mkuper: If I'm reading this correctly, before this change, if we got here, then the size of VT always…
		spatelAuthorUnsubmitted Not Done Reply Inline Actions Previously, I think there was an implicit assumption that each of the Elts was a matching scalar load of VT.getVectorElementType(); this was safe assuming the Elts were extracted from a build_vector. But yes, I agree. I will add a check inside the loop to confirm that each element matches the fractional size that we're expecting, and I will add a check after the loop to confirm that the cumulative size of the element loads matches the total size of the vector type (VT). spatel: Previously, I think there was an implicit assumption that each of the Elts was a matching…
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))		!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
return SDValue();		return SDValue();

SDValue NewLd = SDValue();		SDValue NewLd = SDValue();

NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),		NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->isVolatile(),		LDBase->getPointerInfo(), LDBase->isVolatile(),
LDBase->isNonTemporal(), LDBase->isInvariant(),		LDBase->isNonTemporal(), LDBase->isInvariant(),
LDBase->getAlignment());		LDBase->getAlignment());

if (LDBase->hasAnyUseOfValue(1)) {		if (LDBase->hasAnyUseOfValue(1)) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,		SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
SDValue(LDBase, 1),		SDValue(LDBase, 1),
SDValue(NewLd.getNode(), 1));		SDValue(NewLd.getNode(), 1));
DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);		DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),		DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
SDValue(NewLd.getNode(), 1));		SDValue(NewLd.getNode(), 1));
}		}

return NewLd;		return NewLd;
}		}

//TODO: The code below fires only for for loading the low v2i32 / v2f32		//TODO: The code below fires only for for loading the low v2i32 / v2f32
//of a v4i32 / v4f32. It's probably worth generalizing.		//of a v4i32 / v4f32. It's probably worth generalizing.
		EVT EltVT = VT.getVectorElementType();
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&		if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {		DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);		SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };		SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =		SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,		DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
LDBase->getPointerInfo(),		LDBase->getPointerInfo(),
LDBase->getAlignment(),		LDBase->getAlignment(),
▲ Show 20 Lines • Show All 7,071 Lines • ▼ Show 20 Lines	if (!Subtarget->hasAVX())
return SDValue();		return SDValue();

SDLoc dl(Op);		SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);		SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);		SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);		SDValue Idx = Op.getOperand(2);
MVT OpVT = Op.getSimpleValueType();		MVT OpVT = Op.getSimpleValueType();
MVT SubVecVT = SubVec.getSimpleValueType();		MVT SubVecVT = SubVec.getSimpleValueType();

		// Fold two 16-byte subvector loads into one 32-byte load:
		// (insert_subvector (insert_subvector undef, (load addr)), (load addr + 16))
		// --> load32 addr
		if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
		mkuperUnsubmitted Not Done Reply Inline Actions You probably also want to check that Idx is what you expect it to be. mkuper: You probably also want to check that Idx is what you expect it to be.
		spatelAuthorUnsubmitted Not Done Reply Inline Actions Agree again - this is too lax. I will add checks to make sure that both insert_subvector indices match what we need. spatel: Agree again - this is too lax. I will add checks to make sure that both insert_subvector…
		OpVT.is256BitVector() &&
		!Subtarget->isUnalignedMem32Slow()) {
		SDValue Ops[] = { Vec.getOperand(1), SubVec };
		SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
		if (LD.getNode())
		return LD;
		}

if ((OpVT.is256BitVector() \|\| OpVT.is512BitVector()) &&		if ((OpVT.is256BitVector() \|\| OpVT.is512BitVector()) &&
SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {		SubVecVT.is128BitVector() && isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();		unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);		return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}		}

if (OpVT.is512BitVector() &&		if (OpVT.is512BitVector() &&
SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {		SubVecVT.is256BitVector() && isa<ConstantSDNode>(Idx)) {
▲ Show 20 Lines • Show All 13,589 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,135 Lines • ▼ Show 20 Lines	def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
(VINSERTF128rm VR256:$src1, addr:$src2,		(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;		(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),		def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
(iPTR imm)),		(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,		(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;		(INSERT_get_vinsert128_imm VR256:$ins))>;
}		}

// Combine two consecutive 16-byte loads with a common destination register into
// one 32-byte load to that register.
let Predicates = [HasAVX, HasFastMem32] in {
def : Pat<(insert_subvector
(v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
(loadv4f32 (add addr:$src, (iPTR 16))),
(iPTR 4)),
(VMOVUPSYrm addr:$src)>;

def : Pat<(insert_subvector
(v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))),
(loadv2f64 (add addr:$src, (iPTR 16))),
(iPTR 2)),
(VMOVUPDYrm addr:$src)>;

def : Pat<(insert_subvector
(v32i8 (insert_subvector
undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 16)),
(VMOVDQUYrm addr:$src)>;

def : Pat<(insert_subvector
(v16i16 (insert_subvector
undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 8)),
(VMOVDQUYrm addr:$src)>;

def : Pat<(insert_subvector
(v8i32 (insert_subvector
undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))),
(bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))),
(iPTR 4)),
(VMOVDQUYrm addr:$src)>;

def : Pat<(insert_subvector
(v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))),
(loadv2i64 (add addr:$src, (iPTR 16))),
(iPTR 2)),
(VMOVDQUYrm addr:$src)>;
}

let Predicates = [HasAVX1Only] in {		let Predicates = [HasAVX1Only] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),		def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
(iPTR imm)),		(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,		(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;		(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),		def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
(iPTR imm)),		(iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,		(VINSERTF128rr VR256:$src1, VR128:$src2,
▲ Show 20 Lines • Show All 981 Lines • Show Last 20 Lines

test/CodeGen/X86/unaligned-32-byte-memops.ll

Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vmovups		; BTVER2: vmovups
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovups		; HASWELL: vmovups
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1		%ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
%v1 = load <4 x float>* %ptr, align 1		%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
		%v1 = load <4 x float>* %ptr1, align 1
%v2 = load <4 x float>* %ptr2, align 1		%v2 = load <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>		%shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)		%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
ret <8 x float> %v3		ret <8 x float> %v3
}		}

; Swap the operands of the shufflevector and vinsertf128 to ensure that the		; Swap the operands of the shufflevector and vinsertf128 to ensure that the
; pattern still matches.		; pattern still matches.
define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {		define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_swap		; CHECK-LABEL: combine_16_byte_loads_swap

; SANDYB: vmovups		; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vmovups		; BTVER2: vmovups
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovups		; HASWELL: vmovups
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1		%ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
%v1 = load <4 x float>* %ptr, align 1		%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
		%v1 = load <4 x float>* %ptr1, align 1
%v2 = load <4 x float>* %ptr2, align 1		%v2 = load <4 x float>* %ptr2, align 1
%shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>		%shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)		%v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
ret <8 x float> %v3		ret <8 x float> %v3
}		}

; Replace the vinsertf128 intrinsic with a shufflevector as might be		; Replace the vinsertf128 intrinsic with a shufflevector as might be
; expected from auto-vectorized code.		; expected from auto-vectorized code.
define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {		define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_no_intrinsic		; CHECK-LABEL: combine_16_byte_loads_no_intrinsic

; SANDYB: vmovups		; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vmovups		; BTVER2: vmovups
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovups		; HASWELL: vmovups
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1		%ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
%v1 = load <4 x float>* %ptr, align 1		%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
		%v1 = load <4 x float>* %ptr1, align 1
%v2 = load <4 x float>* %ptr2, align 1		%v2 = load <4 x float>* %ptr2, align 1
%v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>		%v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v3		ret <8 x float> %v3
}		}

; Swap the order of the shufflevector operands to ensure that the		; Swap the order of the shufflevector operands to ensure that the
; pattern still matches.		; pattern still matches.
define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {		define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap		; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap

; SANDYB: vmovups		; SANDYB: vmovups
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vmovups		; BTVER2: vmovups
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovups		; HASWELL: vmovups
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1		%ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
%v1 = load <4 x float>* %ptr, align 1		%ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
		%v1 = load <4 x float>* %ptr1, align 1
%v2 = load <4 x float>* %ptr2, align 1		%v2 = load <4 x float>* %ptr2, align 1
%v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>		%v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %v3		ret <8 x float> %v3
}		}

; Check each element type other than float to make sure it is handled correctly.		; Check each element type other than float to make sure it is handled correctly.
; Use the loaded values with an 'add' to make sure we're using the correct load type.		; Use the loaded values with an 'add' to make sure we're using the correct load type.
; Even though BtVer2 has fast 32-byte loads, we should not generate those for		; Even though BtVer2 has fast 32-byte loads, we should not generate those for
Show All 9 Lines	define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vextractf128		; BTVER2: vextractf128
; BTVER2-NEXT: vpaddq		; BTVER2-NEXT: vpaddq
; BTVER2-NEXT: vpaddq		; BTVER2-NEXT: vpaddq
; BTVER2-NEXT: vinsertf128		; BTVER2-NEXT: vinsertf128
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovdqu		; HASWELL-NOT: vextract
; HASWELL-NEXT: vpaddq		; HASWELL: vpaddq
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1		%ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
%v1 = load <2 x i64>* %ptr, align 1		%ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
		%v1 = load <2 x i64>* %ptr1, align 1
%v2 = load <2 x i64>* %ptr2, align 1		%v2 = load <2 x i64>* %ptr2, align 1
%v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>		%v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v4 = add <4 x i64> %v3, %x		%v4 = add <4 x i64> %v3, %x
ret <4 x i64> %v4		ret <4 x i64> %v4
}		}

define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {		define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; CHECK-LABEL: combine_16_byte_loads_i32		; CHECK-LABEL: combine_16_byte_loads_i32

; SANDYB: vextractf128		; SANDYB: vextractf128
; SANDYB-NEXT: vpaddd		; SANDYB-NEXT: vpaddd
; SANDYB-NEXT: vpaddd		; SANDYB-NEXT: vpaddd
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vextractf128		; BTVER2: vextractf128
; BTVER2-NEXT: vpaddd		; BTVER2-NEXT: vpaddd
; BTVER2-NEXT: vpaddd		; BTVER2-NEXT: vpaddd
; BTVER2-NEXT: vinsertf128		; BTVER2-NEXT: vinsertf128
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovdqu		; HASWELL-NOT: vextract
; HASWELL-NEXT: vpaddd		; HASWELL: vpaddd
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1		%ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
%v1 = load <4 x i32>* %ptr, align 1		%ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
		%v1 = load <4 x i32>* %ptr1, align 1
%v2 = load <4 x i32>* %ptr2, align 1		%v2 = load <4 x i32>* %ptr2, align 1
%v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>		%v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v4 = add <8 x i32> %v3, %x		%v4 = add <8 x i32> %v3, %x
ret <8 x i32> %v4		ret <8 x i32> %v4
}		}

define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {		define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; CHECK-LABEL: combine_16_byte_loads_i16		; CHECK-LABEL: combine_16_byte_loads_i16

; SANDYB: vextractf128		; SANDYB: vextractf128
; SANDYB-NEXT: vpaddw		; SANDYB-NEXT: vpaddw
; SANDYB-NEXT: vpaddw		; SANDYB-NEXT: vpaddw
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vextractf128		; BTVER2: vextractf128
; BTVER2-NEXT: vpaddw		; BTVER2-NEXT: vpaddw
; BTVER2-NEXT: vpaddw		; BTVER2-NEXT: vpaddw
; BTVER2-NEXT: vinsertf128		; BTVER2-NEXT: vinsertf128
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovdqu		; HASWELL-NOT: vextract
; HASWELL-NEXT: vpaddw		; HASWELL: vpaddw
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1		%ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
%v1 = load <8 x i16>* %ptr, align 1		%ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
		%v1 = load <8 x i16>* %ptr1, align 1
%v2 = load <8 x i16>* %ptr2, align 1		%v2 = load <8 x i16>* %ptr2, align 1
%v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>		%v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v4 = add <16 x i16> %v3, %x		%v4 = add <16 x i16> %v3, %x
ret <16 x i16> %v4		ret <16 x i16> %v4
}		}

define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {		define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; CHECK-LABEL: combine_16_byte_loads_i8		; CHECK-LABEL: combine_16_byte_loads_i8

; SANDYB: vextractf128		; SANDYB: vextractf128
; SANDYB-NEXT: vpaddb		; SANDYB-NEXT: vpaddb
; SANDYB-NEXT: vpaddb		; SANDYB-NEXT: vpaddb
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vextractf128		; BTVER2: vextractf128
; BTVER2-NEXT: vpaddb		; BTVER2-NEXT: vpaddb
; BTVER2-NEXT: vpaddb		; BTVER2-NEXT: vpaddb
; BTVER2-NEXT: vinsertf128		; BTVER2-NEXT: vinsertf128
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovdqu		; HASWELL-NOT: vextract
; HASWELL-NEXT: vpaddb		; HASWELL: vpaddb
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1		%ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
%v1 = load <16 x i8>* %ptr, align 1		%ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
		%v1 = load <16 x i8>* %ptr1, align 1
%v2 = load <16 x i8>* %ptr2, align 1		%v2 = load <16 x i8>* %ptr2, align 1
%v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>		%v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%v4 = add <32 x i8> %v3, %x		%v4 = add <32 x i8> %v3, %x
ret <32 x i8> %v4		ret <32 x i8> %v4
}		}

define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {		define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
; CHECK-LABEL: combine_16_byte_loads_double		; CHECK-LABEL: combine_16_byte_loads_double

; SANDYB: vmovupd		; SANDYB: vmovupd
; SANDYB-NEXT: vinsertf128		; SANDYB-NEXT: vinsertf128
; SANDYB-NEXT: vaddpd		; SANDYB-NEXT: vaddpd
; SANDYB-NEXT: retq		; SANDYB-NEXT: retq

; BTVER2: vmovupd		; BTVER2-NOT: vinsertf128
; BTVER2-NEXT: vaddpd		; BTVER2: vaddpd
; BTVER2-NEXT: retq		; BTVER2-NEXT: retq

; HASWELL: vmovupd		; HASWELL-NOT: vinsertf128
; HASWELL: vaddpd		; HASWELL: vaddpd
; HASWELL-NEXT: retq		; HASWELL-NEXT: retq

%ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1		%ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
%v1 = load <2 x double>* %ptr, align 1		%ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
		%v1 = load <2 x double>* %ptr1, align 1
%v2 = load <2 x double>* %ptr2, align 1		%v2 = load <2 x double>* %ptr2, align 1
%v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>		%v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v4 = fadd <4 x double> %v3, %x		%v4 = fadd <4 x double> %v3, %x
ret <4 x double> %v4		ret <4 x double> %v4
}		}