This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Missing SSE/AVX1 memory folding integer instructions
ClosedPublic

Authored by RKSimon on Jan 21 2015, 7:28 AM.

Download Raw Diff

Details

Reviewers

spatel
qcolombet
andreadb

Commits

rG5fa0fb23ca5d: [X86][SSE] Missing SSE/AVX1 memory folding integer instructions
rL226745: [X86][SSE] Missing SSE/AVX1 memory folding integer instructions

Summary

Added most of the missing integer vector folding patterns for SSE (to SSE42) and AVX1.

The most useful of these are probably the i32/i64 extraction, i8/i16/i32/i64 insertions, zero/sign extension, unsigned saturation subtractions, i64 subtractions and the variable mask blends (pblendvb) - others include CLMUL, SSE42 string comparisons and bit tests.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 18514. (Jan 21 2015, 7:28 AM)

RKSimon retitled this revision to [X86][SSE] Missing SSE/AVX1 memory folding integer instructions.

RKSimon updated this object.

RKSimon edited the test plan for this revision. (Show Details)

RKSimon added reviewers: qcolombet, andreadb, spatel.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a subscriber: Unknown Object (MLST).

Hi Simon,

Do you know why some of the psubusw are not folded with this patch?

Thanks,
-Quentin

test/CodeGen/X86/psubus.ll
29	Why is this not folded anymore?

Sorry, some of those old tests aren't at all clear about what is going on.

test/CodeGen/X86/psubus.ll
29	Oddly the addition of the folding patterns has allowed the load of the constant (to %xmm0) to be pulled out of the loop.

qcolombet added inline comments. (Jan 21 2015, 1:46 PM)

test/CodeGen/X86/psubus.ll
29	Thanks for checking. Could we get rid of the loop to have a test on the folding?

Sure I can simplify psubus.ll tests - do you want it as part of this patch? We do already have the proper stack folding tests for (v)psubusb as well of course.

Sure I can simplify psubus.ll tests - do you want it as part of this patch?

No, a follow-up patch is fine.

We do already have the proper stack folding tests for (v)psubusb as well of course.

LGTM then :).

Thanks,
-Quentin

This revision is now accepted and ready to land. (Jan 21 2015, 2:43 PM)

Closed by commit rL226745: [X86][SSE] Missing SSE/AVX1 memory folding integer instructions (authored by RKSimon). · Explain Why · Jan 21 2015, 3:45 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

	X86InstrInfo.cpp
	X86InstrInfo.cpp (revision 226662)

58 lines

test/

CodeGen/

X86/

	2011-11-30-or.ll
	2011-11-30-or.ll (revision 226662)

2 lines

	psubus.ll
	psubus.ll (revision 226662)

24 lines

	stack-folding-int-avx1.ll
	stack-folding-int-avx1.ll (revision 226662)

274 lines

	stack-folding-int-sse42.ll
	stack-folding-int-sse42.ll (revision 226662)

274 lines

Diff 18514

lib/Target/X86/X86InstrInfo.cpp

Show First 20 Lines • Show All 327 Lines • ▼ Show 20 Lines	static const X86OpTblEntry OpTbl0[] = {
{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },		{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
{ X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },		{ X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },		{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },		{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },		{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
{ X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },		{ X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
{ X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },		{ X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },		{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
		{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
		{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },		{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },		{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },		{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
{ X86::SETBr, X86::SETBm, TB_FOLDED_STORE },		{ X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
{ X86::SETEr, X86::SETEm, TB_FOLDED_STORE },		{ X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
{ X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },		{ X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
{ X86::SETGr, X86::SETGm, TB_FOLDED_STORE },		{ X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
{ X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },		{ X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
Show All 18 Lines	static const X86OpTblEntry OpTbl0[] = {
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE \| TB_ALIGN_16 },		{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE \| TB_ALIGN_16 },		{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },		{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },		{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },		{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
{ X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },		{ X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },		{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },		{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
		{ X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
		{ X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
// AVX 256-bit foldable instructions		// AVX 256-bit foldable instructions
{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },		{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },		{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },		{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },		{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },		{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },		{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions		// AVX-512 foldable instructions
▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	static const X86OpTblEntry OpTbl1[] = {
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },		{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },		{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },		{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },		{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },		{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },		{ X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 },
{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },		{ X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 },
{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },		{ X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 },
		{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
		{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
		{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
		{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
		{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
		{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 },
		{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 },
		{ X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 },
		{ X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 },
		{ X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 },
		{ X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 },
		{ X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 },
		{ X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 },
		{ X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 },
		{ X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 },
		{ X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 },
		{ X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 },
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },		{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },		{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },		{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
		{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },		{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },		{ X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },		{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },		{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },		{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },		{ X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },		{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },		{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	static const X86OpTblEntry OpTbl1[] = {
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },		{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },		{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },		{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
{ X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },		{ X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },		{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },		{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },		{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },
{ X86::VPABSWrr128, X86::VPABSWrm128, 0 },		{ X86::VPABSWrr128, X86::VPABSWrm128, 0 },
		{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
		{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
		{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
		{ X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
		{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },		{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },		{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
		{ X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 },
		{ X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 },
		{ X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 },
		{ X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 },
		{ X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 },
		{ X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 },
		{ X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 },
		{ X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 },
		{ X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 },
		{ X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 },
		{ X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 },
		{ X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 },
{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },		{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },		{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },		{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
		{ X86::VPTESTrr, X86::VPTESTrm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },		{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },		{ X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },		{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },
{ X86::VROUNDPSr, X86::VROUNDPSm, 0 },		{ X86::VROUNDPSr, X86::VROUNDPSm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },		{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
{ X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },		{ X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },		{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },		{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },
▲ Show 20 Lines • Show All 301 Lines • ▼ Show 20 Lines	static const X86OpTblEntry OpTbl2[] = {
{ X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },		{ X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
{ X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },		{ X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
{ X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },		{ X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
{ X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },		{ X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 },
{ X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },		{ X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },		{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },		{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },		{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
		{ X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
{ X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },		{ X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
		{ X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },		{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },		{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },		{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
{ X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },		{ X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
{ X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },		{ X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
{ X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },		{ X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
{ X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },		{ X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
{ X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },		{ X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
{ X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },		{ X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
{ X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },		{ X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
{ X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },		{ X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
{ X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },		{ X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },		{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
{ X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },		{ X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
{ X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 },		{ X86::PINSRBrr, X86::PINSRBrm, 0 },
		{ X86::PINSRDrr, X86::PINSRDrm, 0 },
		{ X86::PINSRQrr, X86::PINSRQrm, 0 },
		{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
{ X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },		{ X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },		{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },		{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },		{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
{ X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },		{ X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
{ X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },		{ X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },		{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },		{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
Show All 21 Lines	static const X86OpTblEntry OpTbl2[] = {
{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },		{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
{ X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },		{ X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
{ X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },		{ X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
{ X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },		{ X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
{ X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },		{ X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
{ X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },		{ X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
{ X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },		{ X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
{ X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },		{ X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
		{ X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
{ X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },		{ X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
{ X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },		{ X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
		{ X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
		{ X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
{ X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },		{ X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },		{ X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },		{ X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
{ X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },		{ X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },		{ X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
{ X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },		{ X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
{ X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },		{ X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },		{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines	static const X86OpTblEntry OpTbl2[] = {
{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },		{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },		{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
{ X86::VPADDWrr, X86::VPADDWrm, 0 },		{ X86::VPADDWrr, X86::VPADDWrm, 0 },
{ X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },		{ X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
{ X86::VPANDNrr, X86::VPANDNrm, 0 },		{ X86::VPANDNrr, X86::VPANDNrm, 0 },
{ X86::VPANDrr, X86::VPANDrm, 0 },		{ X86::VPANDrr, X86::VPANDrm, 0 },
{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },		{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },		{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },
		{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },		{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
		{ X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },		{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },		{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },		{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
{ X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },		{ X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
{ X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },		{ X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },		{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },		{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },		{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
{ X86::VPHADDDrr, X86::VPHADDDrm, 0 },		{ X86::VPHADDDrr, X86::VPHADDDrm, 0 },
{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },		{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
{ X86::VPHADDWrr, X86::VPHADDWrm, 0 },		{ X86::VPHADDWrr, X86::VPHADDWrm, 0 },
{ X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },		{ X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },		{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },		{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },		{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },		{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
		{ X86::VPINSRBrr, X86::VPINSRBrm, 0 },
		{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
		{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },		{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },		{ X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },		{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },		{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },		{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
{ X86::VPMINSWrr, X86::VPMINSWrm, 0 },		{ X86::VPMINSWrr, X86::VPMINSWrm, 0 },
{ X86::VPMINUBrr, X86::VPMINUBrm, 0 },		{ X86::VPMINUBrr, X86::VPMINUBrm, 0 },
{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },		{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
Show All 22 Lines	static const X86OpTblEntry OpTbl2[] = {
{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },		{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },
{ X86::VPSRADrr, X86::VPSRADrm, 0 },		{ X86::VPSRADrr, X86::VPSRADrm, 0 },
{ X86::VPSRAWrr, X86::VPSRAWrm, 0 },		{ X86::VPSRAWrr, X86::VPSRAWrm, 0 },
{ X86::VPSRLDrr, X86::VPSRLDrm, 0 },		{ X86::VPSRLDrr, X86::VPSRLDrm, 0 },
{ X86::VPSRLQrr, X86::VPSRLQrm, 0 },		{ X86::VPSRLQrr, X86::VPSRLQrm, 0 },
{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },		{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },
{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },		{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },
{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },		{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },
		{ X86::VPSUBQrr, X86::VPSUBQrm, 0 },
{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },		{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },		{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
		{ X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
		{ X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },		{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },
{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },		{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },		{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
{ X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },		{ X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
{ X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },		{ X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
{ X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },		{ X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
{ X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },		{ X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },		{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
▲ Show 20 Lines • Show All 4,839 Lines • Show Last 20 Lines

test/CodeGen/X86/2011-11-30-or.ll

	; RUN: llc < %s -march=x86-64 -mcpu=corei7 \| FileCheck %s			; RUN: llc < %s -march=x86-64 -mcpu=corei7 \| FileCheck %s

	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
	target triple = "x86_64-apple-macosx10.6.6"			target triple = "x86_64-apple-macosx10.6.6"

	; Test that the order of operands is correct			; Test that the order of operands is correct
	; CHECK: select_func			; CHECK: select_func
	; CHECK: pblendvb %xmm1, %xmm2			; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
	; CHECK: ret			; CHECK: ret

	define void @select_func(<8 x i16> %in) {			define void @select_func(<8 x i16> %in) {
	entry:			entry:
	%c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>			%c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
	%and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>			%and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
	%and.i5.i.i.i = bitcast <8 x i16> %and.i56.i.i.i to <2 x i64>			%and.i5.i.i.i = bitcast <8 x i16> %and.i56.i.i.i to <2 x i64>
	%neg.i.i.i.i = xor <8 x i16> %c.lobit.i.i.i, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>			%neg.i.i.i.i = xor <8 x i16> %c.lobit.i.i.i, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
	Show All 9 Lines

test/CodeGen/X86/psubus.ll

Show All 20 Lines	vector.body: ; preds = %vector.body, %vector.ph
%index.next = add i64 %index, 8		%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384		%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body		br i1 %6, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; SSE2: @test1		; SSE2: @test1
; SSE2: psubusw LCPI0_0(%rip), %xmm0		; SSE2: psubusw %xmm0, %xmm1
		[Inline comment] qcolombet: Why is this not folded anymore?
		[Inline comment] RKSimon (author): Oddly the addition of the folding patterns has allowed the load of the constant (to %xmm0) to be pulled out of the loop.
		[Inline comment] qcolombet: Thanks for checking. Could we get rid of the loop to have a test on the folding?

; AVX1: @test1		; AVX1: @test1
; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0		; AVX1: vpsubusw %xmm0, %xmm1, %xmm1

; AVX2: @test1		; AVX2: @test1
; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0		; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
}		}

define void @test2(i16* nocapture %head) nounwind {		define void @test2(i16* nocapture %head) nounwind {
vector.ph:		vector.ph:
br label %vector.body		br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph		vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]		%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i16* %head, i64 %index		%0 = getelementptr inbounds i16* %head, i64 %index
%1 = bitcast i16* %0 to <8 x i16>*		%1 = bitcast i16* %0 to <8 x i16>*
%2 = load <8 x i16>* %1, align 2		%2 = load <8 x i16>* %1, align 2
%3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>		%3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
%4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>		%4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
%5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer		%5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
store <8 x i16> %5, <8 x i16>* %1, align 2		store <8 x i16> %5, <8 x i16>* %1, align 2
%index.next = add i64 %index, 8		%index.next = add i64 %index, 8
%6 = icmp eq i64 %index.next, 16384		%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body		br i1 %6, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; SSE2: @test2		; SSE2: @test2
; SSE2: psubusw LCPI1_0(%rip), %xmm0		; SSE2: psubusw %xmm0, %xmm1

; AVX1: @test2		; AVX1: @test2
; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0		; AVX1: vpsubusw %xmm0, %xmm1, %xmm1

; AVX2: @test2		; AVX2: @test2
; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0		; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
}		}

define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {		define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:		vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0		%0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer		%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
br label %vector.body		br label %vector.body

Show All 39 Lines	vector.body: ; preds = %vector.body, %vector.ph
%index.next = add i64 %index, 16		%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384		%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body		br i1 %6, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; SSE2: @test4		; SSE2: @test4
; SSE2: psubusb LCPI3_0(%rip), %xmm0		; SSE2: psubusb %xmm0, %xmm1

; AVX1: @test4		; AVX1: @test4
; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0		; AVX1: vpsubusb %xmm0, %xmm1, %xmm1

; AVX2: @test4		; AVX2: @test4
; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0		; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
}		}

define void @test5(i8* nocapture %head) nounwind {		define void @test5(i8* nocapture %head) nounwind {
vector.ph:		vector.ph:
br label %vector.body		br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph		vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]		%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i8* %head, i64 %index		%0 = getelementptr inbounds i8* %head, i64 %index
%1 = bitcast i8* %0 to <16 x i8>*		%1 = bitcast i8* %0 to <16 x i8>*
%2 = load <16 x i8>* %1, align 1		%2 = load <16 x i8>* %1, align 1
%3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>		%3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
%4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>		%4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
%5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer		%5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
store <16 x i8> %5, <16 x i8>* %1, align 1		store <16 x i8> %5, <16 x i8>* %1, align 1
%index.next = add i64 %index, 16		%index.next = add i64 %index, 16
%6 = icmp eq i64 %index.next, 16384		%6 = icmp eq i64 %index.next, 16384
br i1 %6, label %for.end, label %vector.body		br i1 %6, label %for.end, label %vector.body

for.end: ; preds = %vector.body		for.end: ; preds = %vector.body
ret void		ret void

; SSE2: @test5		; SSE2: @test5
; SSE2: psubusb LCPI4_0(%rip), %xmm0		; SSE2: psubusb %xmm0, %xmm1

; AVX1: @test5		; AVX1: @test5
; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0		; AVX1: vpsubusb %xmm0, %xmm1, %xmm1

; AVX2: @test5		; AVX2: @test5
; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0		; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
}		}

define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {		define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:		vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0		%0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer		%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
br label %vector.body		br label %vector.body

▲ Show 20 Lines • Show All 175 Lines • Show Last 20 Lines

test/CodeGen/X86/stack-folding-int-avx1.ll

; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+aes < %s \| FileCheck %s		; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+aes,+pclmul < %s \| FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"		target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.		; Stack reload folding tests.
;		;
; By including a nop call with sideeffects we can force a partial register spill of the		; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.		; relevant registers and check that the reload is correctly folded into the instruction.
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
;CHECK-LABEL: stack_fold_aeskeygenassist		;CHECK-LABEL: stack_fold_aeskeygenassist
;CHECK: vaeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vaeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)		%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
ret <2 x i64> %2		ret <2 x i64> %2
}		}
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone		declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone

		define <4 x i32> @stack_fold_movd_load(i32 %a0) {
		;CHECK-LABEL: stack_fold_movd_load
		;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
		ret <4 x i32> %2
		}

		define i32 @stack_fold_movd_store(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_movd_store
		;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
		%1 = extractelement <4 x i32> %a0, i32 0
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i32 %1
		}

		define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_movq_load
		;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
		ret <2 x i64> %2
		}

		define i64 @stack_fold_movq_store(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_movq_store
		;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
		%1 = extractelement <2 x i64> %a0, i32 0
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i64 %1
		}

define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {		define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pabsb		;CHECK-LABEL: stack_fold_pabsb
;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)		%2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
ret <16 x i8> %2		ret <16 x i8> %2
}		}
declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
▲ Show 20 Lines • Show All 162 Lines • ▼ Show 20 Lines	define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw		;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pblendvb		define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
		;CHECK-LABEL: stack_fold_pblendvb
		;CHECK: vpblendvb {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pblendw		;CHECK-LABEL: stack_fold_pblendw
;CHECK: vpblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)		%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

; TODO stack_fold_pclmulqdq		define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_pclmulqdq
		;CHECK: vpclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone		declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqb		;CHECK-LABEL: stack_fold_pcmpeqb
;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <16 x i8> %a0, %a1		%2 = icmp eq <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>		%3 = sext <16 x i1> %2 to <16 x i8>
Show All 22 Lines	define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqw		;CHECK-LABEL: stack_fold_pcmpeqw
;CHECK: vpcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <8 x i16> %a0, %a1		%2 = icmp eq <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>		%3 = sext <8 x i1> %2 to <8 x i16>
ret <8 x i16> %3		ret <8 x i16> %3
}		}

; TODO stack_fold_pcmpestri		define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpestri
		;CHECK: vpcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
		%2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
		ret i32 %2
		}
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone		declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

; TODO stack_fold_pcmpestrm		define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpestrm
		;CHECK: vpcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone		declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpgtb		;CHECK-LABEL: stack_fold_pcmpgtb
;CHECK: vpcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, %a1		%2 = icmp sgt <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>		%3 = sext <16 x i1> %2 to <16 x i8>
Show All 22 Lines	define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pcmpgtw		;CHECK-LABEL: stack_fold_pcmpgtw
;CHECK: vpcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1		%2 = icmp sgt <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>		%3 = sext <8 x i1> %2 to <8 x i16>
ret <8 x i16> %3		ret <8 x i16> %3
}		}

; TODO stack_fold_pcmpistri		define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpistri
		;CHECK: vpcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
		ret i32 %2
		}
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone		declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pcmpistrm		define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpistrm
		;CHECK: vpcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone		declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pextrb		; TODO stack_fold_pextrb
; TODO stack_fold_pextrd
; TODO stack_fold_pextrq		define i32 @stack_fold_pextrd(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pextrd
		;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
		;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
		%1 = extractelement <4 x i32> %a0, i32 1
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i32 %1
		}

		define i64 @stack_fold_pextrq(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_pextrq
		;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
		;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
		%1 = extractelement <2 x i64> %a0, i32 1
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i64 %1
		}

; TODO stack_fold_pextrw		; TODO stack_fold_pextrw

define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phaddd		;CHECK-LABEL: stack_fold_phaddd
;CHECK: vphaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vphaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2		ret <4 x i32> %2
Show All 13 Lines	define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_phaddw		;CHECK-LABEL: stack_fold_phaddw
;CHECK: vphaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vphaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_phminposuw		define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_phminposuw
		;CHECK: vphminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phsubd		;CHECK-LABEL: stack_fold_phsubd
;CHECK: vphsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vphsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2		ret <4 x i32> %2
Show All 13 Lines	define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_phsubw		;CHECK-LABEL: stack_fold_phsubw
;CHECK: vphsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vphsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pinsrb		define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
; TODO stack_fold_pinsrd		;CHECK-LABEL: stack_fold_pinsrb
; TODO stack_fold_pinsrq		;CHECK: vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
; TODO stack_fold_pinsrw		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
		ret <16 x i8> %2
		}

		define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
		;CHECK-LABEL: stack_fold_pinsrd
		;CHECK: vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
		ret <4 x i32> %2
		}

		define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
		;CHECK-LABEL: stack_fold_pinsrq
		;CHECK: vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
		ret <2 x i64> %2
		}

		define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
		;CHECK-LABEL: stack_fold_pinsrw
		;CHECK: vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
		ret <8 x i16> %2
		}

define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {		define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pmaddubsw		;CHECK-LABEL: stack_fold_pmaddubsw
;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pminuw		;CHECK-LABEL: stack_fold_pminuw
;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pmovsxbd		define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbd
		;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxbq		define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbq
		;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxbw		define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbw
		;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxdq		define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxdq
		;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone

; TODO stack_fold_pmovsxwd		define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxwd
		;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovsxwq		define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxwq
		;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovzxbd		define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbd
		;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxbq		define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbq
		;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxbw		define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbw
		;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxdq		define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxdq
		;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone

; TODO stack_fold_pmovzxwd		define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxwd
		;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovzxwq		define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxwq
		;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone

define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {		define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_pmuldq		;CHECK-LABEL: stack_fold_pmuldq
;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
ret <2 x i64> %2		ret <2 x i64> %2
▲ Show 20 Lines • Show All 214 Lines • ▼ Show 20 Lines
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_psubd		;CHECK-LABEL: stack_fold_psubd
;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i32> %a0, %a1		%2 = sub <4 x i32> %a0, %a1
ret <4 x i32> %2		ret <4 x i32> %2
}		}

; TODO stack_fold_psubq		define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_psubq
		;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = sub <2 x i64> %a0, %a1
		ret <2 x i64> %2
		}

define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_psubsb		;CHECK-LABEL: stack_fold_psubsb
;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)		%2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2		ret <16 x i8> %2
}		}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubsw		;CHECK-LABEL: stack_fold_psubsw
;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_psubusb		define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_psubusb
		;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone

; TODO stack_fold_psubusw		define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
		;CHECK-LABEL: stack_fold_psubusw
		;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubw		;CHECK-LABEL: stack_fold_psubw
;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <8 x i16> %a0, %a1		%2 = sub <8 x i16> %a0, %a1
ret <8 x i16> %2		ret <8 x i16> %2
}		}

; TODO stack_fold_ptest		define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_ptest
		;CHECK: vptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
		ret i32 %2
		}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone		declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_punpckhbw		;CHECK-LABEL: stack_fold_punpckhbw
;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>		%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i8> %2		ret <16 x i8> %2
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines

test/CodeGen/X86/stack-folding-int-sse42.ll

; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes < %s | FileCheck %s		; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"		target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.		; Stack reload folding tests.
;		;
; By including a nop call with sideeffects we can force a partial register spill of the		; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.		; relevant registers and check that the reload is correctly folded into the instruction.
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
;CHECK-LABEL: stack_fold_aeskeygenassist		;CHECK-LABEL: stack_fold_aeskeygenassist
;CHECK: aeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: aeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)		%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
ret <2 x i64> %2		ret <2 x i64> %2
}		}
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone		declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone

		define <4 x i32> @stack_fold_movd_load(i32 %a0) {
		;CHECK-LABEL: stack_fold_movd_load
		;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
		ret <4 x i32> %2
		}

		define i32 @stack_fold_movd_store(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_movd_store
		;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
		%1 = extractelement <4 x i32> %a0, i32 0
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i32 %1
		}

		define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_movq_load
		;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
		ret <2 x i64> %2
		}

		define i64 @stack_fold_movq_store(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_movq_store
		;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
		%1 = extractelement <2 x i64> %a0, i32 0
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i64 %1
		}

define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {		define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pabsb		;CHECK-LABEL: stack_fold_pabsb
;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)		%2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
ret <16 x i8> %2		ret <16 x i8> %2
}		}
declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
▲ Show 20 Lines • Show All 162 Lines • ▼ Show 20 Lines	define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw		;CHECK-LABEL: stack_fold_pavgw
;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pblendvb		define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
		;CHECK-LABEL: stack_fold_pblendvb
		;CHECK: pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pblendw		;CHECK-LABEL: stack_fold_pblendw
;CHECK: pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)		%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone

; TODO stack_fold_pclmulqdq		define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_pclmulqdq
		;CHECK: pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone		declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqb		;CHECK-LABEL: stack_fold_pcmpeqb
;CHECK: pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <16 x i8> %a0, %a1		%2 = icmp eq <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>		%3 = sext <16 x i1> %2 to <16 x i8>
Show All 22 Lines	define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pcmpeqw		;CHECK-LABEL: stack_fold_pcmpeqw
;CHECK: pcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <8 x i16> %a0, %a1		%2 = icmp eq <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>		%3 = sext <8 x i1> %2 to <8 x i16>
ret <8 x i16> %3		ret <8 x i16> %3
}		}

; TODO stack_fold_pcmpestri		define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpestri
		;CHECK: pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
		%2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
		ret i32 %2
		}
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone		declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

; TODO stack_fold_pcmpestrm		define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpestrm
		;CHECK: pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone		declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone

define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pcmpgtb		;CHECK-LABEL: stack_fold_pcmpgtb
;CHECK: pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, %a1		%2 = icmp sgt <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>		%3 = sext <16 x i1> %2 to <16 x i8>
Show All 22 Lines	define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pcmpgtw		;CHECK-LABEL: stack_fold_pcmpgtw
;CHECK: pcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1		%2 = icmp sgt <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>		%3 = sext <8 x i1> %2 to <8 x i16>
ret <8 x i16> %3		ret <8 x i16> %3
}		}

; TODO stack_fold_pcmpistri		define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpistri
		;CHECK: pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
		ret i32 %2
		}
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone		declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pcmpistrm		define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_pcmpistrm
		;CHECK: pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone		declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pextrb		; TODO stack_fold_pextrb
; TODO stack_fold_pextrd
; TODO stack_fold_pextrq		define i32 @stack_fold_pextrd(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pextrd
		;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
		;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
		%1 = extractelement <4 x i32> %a0, i32 1
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i32 %1
		}

		define i64 @stack_fold_pextrq(<2 x i64> %a0) {
		;CHECK-LABEL: stack_fold_pextrq
		;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
		;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
		%1 = extractelement <2 x i64> %a0, i32 1
		%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		ret i64 %1
		}

; TODO stack_fold_pextrw		; TODO stack_fold_pextrw

define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phaddd		;CHECK-LABEL: stack_fold_phaddd
;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2		ret <4 x i32> %2
Show All 13 Lines	define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_phaddw		;CHECK-LABEL: stack_fold_phaddw
;CHECK: phaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: phaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_phminposuw		define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_phminposuw
		;CHECK: phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phsubd		;CHECK-LABEL: stack_fold_phsubd
;CHECK: phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2		ret <4 x i32> %2
Show All 13 Lines	define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_phsubw		;CHECK-LABEL: stack_fold_phsubw
;CHECK: phsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: phsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pinsrb		define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
; TODO stack_fold_pinsrd		;CHECK-LABEL: stack_fold_pinsrb
; TODO stack_fold_pinsrq		;CHECK: pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
; TODO stack_fold_pinsrw		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
		ret <16 x i8> %2
		}

		define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
		;CHECK-LABEL: stack_fold_pinsrd
		;CHECK: pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
		ret <4 x i32> %2
		}

		define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
		;CHECK-LABEL: stack_fold_pinsrq
		;CHECK: pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
		ret <2 x i64> %2
		}

		define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
		;CHECK-LABEL: stack_fold_pinsrw
		;CHECK: pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
		%2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
		ret <8 x i16> %2
		}

define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {		define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pmaddubsw		;CHECK-LABEL: stack_fold_pmaddubsw
;CHECK: pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)		%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pminuw		;CHECK-LABEL: stack_fold_pminuw
;CHECK: pminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_pmovsxbd		define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbd
		;CHECK: pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxbq		define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbq
		;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxbw		define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxbw
		;CHECK: pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovsxdq		define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxdq
		;CHECK: pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone

; TODO stack_fold_pmovsxwd		define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxwd
		;CHECK: pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovsxwq		define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovsxwq
		;CHECK: pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovzxbd		define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbd
		;CHECK: pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxbq		define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbq
		;CHECK: pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxbw		define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxbw
		;CHECK: pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone		declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone

; TODO stack_fold_pmovzxdq		define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxdq
		;CHECK: pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone

; TODO stack_fold_pmovzxwd		define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxwd
		;CHECK: pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0)
		ret <4 x i32> %2
		}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone		declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

; TODO stack_fold_pmovzxwq		define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
		;CHECK-LABEL: stack_fold_pmovzxwq
		;CHECK: pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0)
		ret <2 x i64> %2
		}
declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone		declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone

define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {		define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_pmuldq		;CHECK-LABEL: stack_fold_pmuldq
;CHECK: pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)		%2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
ret <2 x i64> %2		ret <2 x i64> %2
▲ Show 20 Lines • Show All 214 Lines • ▼ Show 20 Lines
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_psubd		;CHECK-LABEL: stack_fold_psubd
;CHECK: psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i32> %a0, %a1		%2 = sub <4 x i32> %a0, %a1
ret <4 x i32> %2		ret <4 x i32> %2
}		}

; TODO stack_fold_psubq		define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_psubq
		;CHECK: psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = sub <2 x i64> %a0, %a1
		ret <2 x i64> %2
		}

define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_psubsb		;CHECK-LABEL: stack_fold_psubsb
;CHECK: psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)		%2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2		ret <16 x i8> %2
}		}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubsw		;CHECK-LABEL: stack_fold_psubsw
;CHECK: psubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: psubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)		%2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2		ret <8 x i16> %2
}		}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone

; TODO stack_fold_psubusb		define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
		;CHECK-LABEL: stack_fold_psubusb
		;CHECK: psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
		ret <16 x i8> %2
		}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone		declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone

; TODO stack_fold_psubusw		define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
		;CHECK-LABEL: stack_fold_psubusw
		;CHECK: psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
		ret <8 x i16> %2
		}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone		declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {		define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_psubw		;CHECK-LABEL: stack_fold_psubw
;CHECK: psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload		;CHECK: psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <8 x i16> %a0, %a1		%2 = sub <8 x i16> %a0, %a1
ret <8 x i16> %2		ret <8 x i16> %2
}		}

; TODO stack_fold_ptest		define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
		;CHECK-LABEL: stack_fold_ptest
		;CHECK: ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
		%2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
		ret i32 %2
		}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone		declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {		define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_punpckhbw		;CHECK-LABEL: stack_fold_punpckhbw
;CHECK: punpckhbw {{-?[0-9]}}(%rsp), {{%xmm[0-9][0-9]}} {{.*#+}} 16-byte Folded Reload		;CHECK: punpckhbw {{-?[0-9]}}(%rsp), {{%xmm[0-9][0-9]}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()		%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>		%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i8> %2		ret <16 x i8> %2
▲ Show 20 Lines • Show All 67 Lines • Show Last 20 Lines