This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support
ClosedPublic

Authored by RKSimon on Feb 7 2016, 3:40 AM.

Download Raw Diff

Details

Reviewers

spatel
qcolombet
delena
andreadb

Commits

rGa207436b01b1: [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support
rL260168: [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support

Summary

As discussed on PR26491, this patch adds support for lowering v4f32 shuffles to the MOVLHPS/MOVHLPS instructions. It also adds support for memory folding with their MOVLPS/MOVHPS load equivalents.

This first patch only really helps SSE1 targets as SSE2+ targets will widen the shuffle mask and use v2f64 equivalents (although they still combine to MOVLHPS/MOVHLPS for v2f64 splats). This will have to be addressed in a future patch, most likely when we add support for binary target shuffle combines.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 47121. Feb 7 2016, 3:40 AM

RKSimon retitled this revision to [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support.

RKSimon updated this object.

RKSimon added reviewers: qcolombet, delena, andreadb, spatel.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: llvm-commits.

Shouldn't MOVHPS/MOVLPS/MOVLHPS/MOVHLPS be in "hasPartialRegUpdate()"? I wonder if the partial reg update is why we weren't trying to lower these before.

If that's correct, I think a FIXME comment is fine for now.

lib/Target/X86/X86InstrInfo.cpp
5541–5549	Extra indent here.
5542	Does the AVX case need the alignment restriction? I don't think we'd actually generate this instruction in the first place if we have AVX, so it might be a moot point, but it may be worth a code comment.

Updated based on Sanjay's comments.

spatel added inline comments. Feb 7 2016, 1:37 PM

lib/Target/X86/X86InstrInfo.cpp
18–1	Should MOVHPS/MOVLPS/MOVLHPS/MOVHLPS be in this list?

Thanks Sanjay - I missed that comment! The AMD Family 15h Software Optimization Guide (SOG) highlights the MOVHP*/MOVLP* loads as having a merge dependency, so I've added them to hasPartialRegUpdate().

Please see another inline comment for hasPartialRegUpdate(). LGTM after that is addressed.

lib/Target/X86/X86InstrInfo.cpp
19–22	We're creating MOVLHPSrr / MOVHLPSrr with this patch, so I think those should be included too.

This revision is now accepted and ready to land. Feb 8 2016, 7:06 AM

Closed by commit rL260168: [X86][SSE1] Add MOVLHPS/MOVHLPS lowering and memory folding support (authored by RKSimon). · Explain Why · Feb 8 2016, 3:08 PM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in D23027: [X86][SSE] Avoid specifying unused arguments in SHUFPD lowering. Aug 1 2016, 12:12 PM

RKSimon mentioned this in rL279430: [X86][SSE] Avoid specifying unused arguments in SHUFPD lowering. Aug 22 2016, 6:05 AM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

6 lines

X86InstrInfo.cpp

18 lines

test/

CodeGen/

X86/

vector-shuffle-sse1.ll

29 lines

Diff 47121

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,997 Lines • ▼ Show 20 Lines	if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
return V;		return V;

if (!isSingleSHUFPSMask(Mask))		if (!isSingleSHUFPSMask(Mask))
if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(		if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
DL, MVT::v4f32, V1, V2, Mask, DAG))		DL, MVT::v4f32, V1, V2, Mask, DAG))
return BlendPerm;		return BlendPerm;
}		}

		// Use low/high mov instructions.
		if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
		return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
		if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
		return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

// Use dedicated unpack instructions for masks that match their pattern.		// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =		if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))		lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;		return V;

// Otherwise fall back to a SHUFPS lowering strategy.		// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);		return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}		}
▲ Show 20 Lines • Show All 20,334 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show All 10 Lines
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "X86InstrInfo.h"		#include "X86InstrInfo.h"
#include "X86.h"		#include "X86.h"
#include "X86InstrBuilder.h"		#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"		#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"		#include "X86Subtarget.h"
#include "X86TargetMachine.h"		#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveVariables.h"		#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"		#include "llvm/CodeGen/MachineConstantPool.h"
		spatelUnsubmitted Not Done Reply Inline Actions We're creating MOVLHPSrr / MOVHLPSrr with this patch, so I think those should be included too. spatel: We're creating MOVLHPSrr / MOVHLPSrr with this patch, so I think those should be included too.
#include "llvm/CodeGen/MachineDominators.h"		#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"		#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"		#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"		#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"		#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
▲ Show 20 Lines • Show All 962 Lines • ▼ Show 20 Lines	static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::MAXSSrr, X86::MAXSSrm, 0 },		{ X86::MAXSSrr, X86::MAXSSrm, 0 },
{ X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },		{ X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },		{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },		{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },		{ X86::MINSDrr, X86::MINSDrm, 0 },
{ X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },		{ X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
{ X86::MINSSrr, X86::MINSSrm, 0 },		{ X86::MINSSrr, X86::MINSSrm, 0 },
{ X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },		{ X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
		{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },		{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },		{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },		{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
{ X86::MULSDrr, X86::MULSDrm, 0 },		{ X86::MULSDrr, X86::MULSDrm, 0 },
{ X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },		{ X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
{ X86::MULSSrr, X86::MULSSrm, 0 },		{ X86::MULSSrr, X86::MULSSrm, 0 },
{ X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },		{ X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
{ X86::OR16rr, X86::OR16rm, 0 },		{ X86::OR16rr, X86::OR16rm, 0 },
▲ Show 20 Lines • Show All 284 Lines • ▼ Show 20 Lines	static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },		{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
{ X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },		{ X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },		{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },		{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },		{ X86::VMINSDrr, X86::VMINSDrm, 0 },
{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },		{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },		{ X86::VMINSSrr, X86::VMINSSrm, 0 },
{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },		{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
		{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },		{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },		{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },		{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },		{ X86::VMULSDrr, X86::VMULSDrm, 0 },
{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },		{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },		{ X86::VMULSSrr, X86::VMULSSrm, 0 },
{ X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },		{ X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
{ X86::VORPDrr, X86::VORPDrm, 0 },		{ X86::VORPDrr, X86::VORPDrm, 0 },
▲ Show 20 Lines • Show All 4,217 Lines • ▼ Show 20 Lines	if (OpNum == 2) {
: X86::INSERTPSrm);		: X86::INSERTPSrm);
MachineInstr *NewMI =		MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);		FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);		NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
return NewMI;		return NewMI;
}		}
}		}
break;		break;
		case X86::MOVHLPSrr:
		case X86::VMOVHLPSrr:
		// Move the upper 64-bits of the second operand to the lower 64-bits.
		// To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
		if (OpNum == 2) {
		unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
		if (Size <= RCSize && 8 <= Align) {
		spatelUnsubmitted Not Done Reply Inline Actions Does the AVX case need the alignment restriction? I don't think we'd actually generate this instruction in the first place if we have AVX, so it might be a moot point, but it may be worth a code comment. spatel: Does the AVX case need the alignment restriction? I don't think we'd actually generate…
		unsigned NewOpCode =
		(MI->getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm
		: X86::MOVLPSrm);
		MachineInstr *NewMI =
		FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
		return NewMI;
		}
		spatelUnsubmitted Not Done Reply Inline Actions Extra indent here. spatel: Extra indent here.
		}
		break;
};		};

return nullptr;		return nullptr;
}		}

MachineInstr *X86InstrInfo::foldMemoryOperandImpl(		MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr *MI, unsigned OpNum,		MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,		ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
▲ Show 20 Lines • Show All 1,807 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-sse1.ll

	Show First 20 Lines • Show All 88 Lines • ▼ Show 20 Lines
	; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]			; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>			%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}
	define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {			define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
	; SSE1-LABEL: shuffle_v4f32_0145:			; SSE1-LABEL: shuffle_v4f32_0145:
	; SSE1: # BB#0:			; SSE1: # BB#0:
	; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]			; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>			%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}
	define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {			define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
	; SSE1-LABEL: shuffle_v4f32_6723:			; SSE1-LABEL: shuffle_v4f32_6723:
	; SSE1: # BB#0:			; SSE1: # BB#0:
	; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[2,3]			; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
	; SSE1-NEXT: movaps %xmm1, %xmm0
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>			%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}

	define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {			define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
	; SSE1-LABEL: shuffle_v4f32_4zzz:			; SSE1-LABEL: shuffle_v4f32_4zzz:
	; SSE1: # BB#0:			; SSE1: # BB#0:
	▲ Show 20 Lines • Show All 91 Lines • ▼ Show 20 Lines
	; SSE1-NEXT: movq (%rdi), %rax			; SSE1-NEXT: movq (%rdi), %rax
	; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)			; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
	; SSE1-NEXT: shrq $32, %rax			; SSE1-NEXT: shrq $32, %rax
	; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)			; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
	; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero			; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]			; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
	; SSE1-NEXT: xorps %xmm2, %xmm2			; SSE1-NEXT: xorps %xmm2, %xmm2
	; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]			; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
	; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]			; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
	; SSE1-NEXT: movaps %xmm1, %xmm0			; SSE1-NEXT: movaps %xmm1, %xmm0
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%a = load <2 x float>, <2 x float>* %ptr			%a = load <2 x float>, <2 x float>* %ptr
	%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>			%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
	%shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>			%shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}

	define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {			define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
	; SSE1-LABEL: insert_mem_hi_v4f32:			; SSE1-LABEL: insert_mem_hi_v4f32:
	; SSE1: # BB#0:			; SSE1: # BB#0:
	; SSE1-NEXT: movq (%rdi), %rax			; SSE1-NEXT: movq (%rdi), %rax
	; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)			; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
	; SSE1-NEXT: shrq $32, %rax			; SSE1-NEXT: shrq $32, %rax
	; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)			; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
	; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero			; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]			; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
	; SSE1-NEXT: xorps %xmm2, %xmm2			; SSE1-NEXT: xorps %xmm2, %xmm2
	; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]			; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
	; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]			; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%a = load <2 x float>, <2 x float>* %ptr			%a = load <2 x float>, <2 x float>* %ptr
	%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>			%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
	%shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>			%shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}

	define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {			define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
	; SSE1-LABEL: shuffle_mem_v4f32_3210:			; SSE1-LABEL: shuffle_mem_v4f32_3210:
	; SSE1: # BB#0:			; SSE1: # BB#0:
	; SSE1-NEXT: movaps (%rdi), %xmm0			; SSE1-NEXT: movaps (%rdi), %xmm0
	; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]			; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
	; SSE1-NEXT: retq			; SSE1-NEXT: retq
	%a = load <4 x float>, <4 x float>* %ptr			%a = load <4 x float>, <4 x float>* %ptr
	%shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>			%shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	ret <4 x float> %shuffle			ret <4 x float> %shuffle
	}			}
				define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
				; SSE1-LABEL: shuffle_mem_v4f32_0145:
				; SSE1: # BB#0:
				; SSE1-NEXT: movhps (%rdi), %xmm0
				; SSE1-NEXT: retq
				%b = load <4 x float>, <4 x float>* %pb, align 16
				%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
				ret <4 x float> %shuffle
				}
				define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
				; SSE1-LABEL: shuffle_mem_v4f32_6723:
				; SSE1: # BB#0:
				; SSE1-NEXT: movlps 8(%rdi), %xmm0
				; SSE1-NEXT: retq
				%b = load <4 x float>, <4 x float>* %pb, align 16
				%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
				ret <4 x float> %shuffle
				}