This is an archive of the discontinued LLVM Phabricator instance.

CSE removes COPY.
Needs Revision
Public

Authored by Danil on May 28 2014, 11:33 PM.

Details

Reviewers
atrick
Summary
  1. The reason why CSE ignored COPY is PerformTrivialCoalescing.

If a COPY is inserted into the hash table and PerformTrivialCoalescing then calls MI->eraseFromParent(),
the hash table is left holding a dangling entry. The main idea is to perform PerformTrivialCoalescing before the insertion.
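The ordering hazard can be illustrated with a self-contained toy model (hypothetical names — `coalesceTrivial`, `cseBlock` — not the real MachineCSE code; use rewriting is omitted). The point is that the erasing step must run before the instruction is recorded in the expression map, otherwise the map ends up holding a pointer to a deleted instruction:

```cpp
#include <cassert>
#include <list>
#include <string>
#include <unordered_map>

// Toy instruction: a defined register plus an opaque right-hand side.
struct Inst {
  std::string def;   // defined register, e.g. "a0"
  std::string expr;  // right-hand side, e.g. "COPY r0"
};

// Toy stand-in for PerformTrivialCoalescing: erase trivial self-copies
// ("x = COPY x"). Returns true if the instruction was erased.
static bool coalesceTrivial(std::list<Inst> &block,
                            std::list<Inst>::iterator it) {
  if (it->expr == "COPY " + it->def) {
    block.erase(it);
    return true;
  }
  return false;
}

// CSE over one block. Coalescing runs BEFORE the instruction is inserted
// into the expression map, so the map never sees an erased instruction.
// (A real pass would also rewrite uses of the erased def; omitted here.)
static int cseBlock(std::list<Inst> &block) {
  std::unordered_map<std::string, std::string> seen; // expr -> defining reg
  int removed = 0;
  for (auto it = block.begin(); it != block.end();) {
    auto cur = it++;                  // advance first: erasing cur is safe
    if (coalesceTrivial(block, cur))
      continue;                       // erased: never reaches the map
    if (seen.count(cur->expr)) {
      block.erase(cur);               // duplicate expression: CSE it
      ++removed;
    } else {
      seen.emplace(cur->expr, cur->def);
    }
  }
  return removed;
}
```

With the opposite ordering (insert into `seen` first, then coalesce), the map would keep referring to instructions that `erase` has already destroyed, which is the breakage described above.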


  2. cross-regclass copy:

We are developing a backend for a new architecture with independent GPR and ADDRRegs register classes.
In this case we get redundant MOVs between the two independent, disjoint register classes:
move r0.l, a0.l <<<<<<<<<<<<
store r2.d, (a0.l)
move r0.l, a0.l <<<<<<<<<<<<
store r4.d, (a0.l+8)
With the patch, CSE removes this kind of MOV.


  3. subreg copy: CodeGen/X86/cse-add-with-overflow.ll passed

because CSE fires for the second "add" and its corresponding COPYs,
while "Simple Register Coalescing" coalesces the
first "add" and its corresponding COPYs. (This scheme seems natural,
so I removed the FIXME in PerformTrivialCoalescing):

    //--------   dump using patch: -print-after-all:
    # *** IR Dump After Machine Loop Invariant Code Motion ***:
    # Machine code for function redundantadd: SSA
    Function Live Ins: %RDI in %vreg2, %RSI in %vreg3

    BB#0: derived from LLVM BB %entry
      Live Ins: %RDI %RSI
    %vreg3<def> = COPY %RSI; GR64:%vreg3
    %vreg2<def> = COPY %RDI; GR64:%vreg2
    %vreg0<def> = MOV64rm %vreg2, 1, %noreg, 0, %noreg; mem:LD8[%a0] GR64:%vreg0,%vreg2
    %vreg1<def> = MOV64rm %vreg3, 1, %noreg, 0, %noreg; mem:LD8[%a1] GR64:%vreg1,%vreg3
    %vreg4<def> = COPY %vreg0:sub_32bit; GR32:%vreg4 GR64:%vreg0
    %vreg5<def> = COPY %vreg1:sub_32bit; GR32:%vreg5 GR64:%vreg1
    %vreg6<def,tied1> = ADD32rr %vreg4<tied0>, %vreg5<kill>, %EFLAGS<imp-def>; GR32:%vreg6,%vreg4,%vreg5
    JNO_4 <BB#2>, %EFLAGS<imp-use>
    JMP_4 <BB#1>
    Successors according to CFG: BB#1(1) BB#2(1048575)

    BB#1: derived from LLVM BB %exit2
    Predecessors according to CFG: BB#0

    BB#2: derived from LLVM BB %return
    Predecessors according to CFG: BB#0
    >>>>>>>>>%vreg7<def> = COPY %vreg0:sub_32bit; GR32:%vreg7 GR64:%vreg0
    >>>>>>>>>%vreg8<def> = COPY %vreg1:sub_32bit; GR32:%vreg8 GR64:%vreg1
    >>>>>>>>>%vreg9<def,tied1> = ADD32rr %vreg8<tied0>, %vreg7<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg9,%vreg8,%vreg7
    %vreg10<def> = SUBREG_TO_REG 0, %vreg9<kill>, 4; GR64:%vreg10 GR32:%vreg9
    %RAX<def> = COPY %vreg10; GR64:%vreg10
    RETQ %RAX

    # End machine code for function redundantadd.

    # *** IR Dump After Machine Common Subexpression Elimination ***:
    # Machine code for function redundantadd: SSA
    Function Live Ins: %RDI in %vreg2, %RSI in %vreg3

    BB#0: derived from LLVM BB %entry
    Live Ins: %RDI %RSI
    %vreg3<def> = COPY %RSI; GR64:%vreg3
    %vreg2<def> = COPY %RDI; GR64:%vreg2
    %vreg0<def> = MOV64rm %vreg2, 1, %noreg, 0, %noreg; mem:LD8[%a0] GR64:%vreg0,%vreg2
    %vreg1<def> = MOV64rm %vreg3, 1, %noreg, 0, %noreg; mem:LD8[%a1] GR64:%vreg1,%vreg3
    %vreg4<def> = COPY %vreg0:sub_32bit; GR32:%vreg4 GR64:%vreg0
    %vreg5<def> = COPY %vreg1:sub_32bit; GR32:%vreg5 GR64:%vreg1
    %vreg6<def,tied1> = ADD32rr %vreg4<tied0>, %vreg5, %EFLAGS<imp-def>; GR32:%vreg6,%vreg4,%vreg5
    JNO_4 <BB#2>, %EFLAGS<imp-use>
    JMP_4 <BB#1>
    Successors according to CFG: BB#1(1) BB#2(1048575)

    BB#1: derived from LLVM BB %exit2
    Predecessors according to CFG: BB#0

    BB#2: derived from LLVM BB %return
    Predecessors according to CFG: BB#0
    %vreg10<def> = SUBREG_TO_REG 0, %vreg6, 4; GR64:%vreg10 GR32:%vreg6
    %RAX<def> = COPY %vreg10; GR64:%vreg10
    RETQ %RAX

    # End machine code for function redundantadd.
    .........................................
    .........................................
    # *** IR Dump After Live Interval Analysis ***:
    # Machine code for function redundantadd: Post SSA
    Function Live Ins: %RDI in %vreg2, %RSI in %vreg3

    0B    BB#0: derived from LLVM BB %entry
	    Live Ins: %RDI %RSI
    16B		%vreg3<def> = COPY %RSI; GR64:%vreg3
    32B		%vreg2<def> = COPY %RDI; GR64:%vreg2
    48B		%vreg0<def> = MOV64rm %vreg2, 1, %noreg, 0, %noreg; mem:LD8[%a0] GR64:%vreg0,%vreg2
    64B		%vreg1<def> = MOV64rm %vreg3, 1, %noreg, 0, %noreg; mem:LD8[%a1] GR64:%vreg1,%vreg3
    >>>>>>>>>>80B		%vreg4<def> = COPY %vreg0:sub_32bit; GR32:%vreg4 GR64:%vreg0
    >>>>>>>>>>96B		%vreg5<def> = COPY %vreg1:sub_32bit; GR32:%vreg5 GR64:%vreg1
    >>>>>>>>>>112B		%vreg6<def> = COPY %vreg5; GR32:%vreg6,%vreg5
    128B		%vreg6<def,tied1> = ADD32rr %vreg6<tied0>, %vreg4, %EFLAGS<imp-def>; GR32:%vreg6,%vreg4
    144B		JNO_4 <BB#2>, %EFLAGS<imp-use,kill>
    160B		JMP_4 <BB#1>
    Successors according to CFG: BB#1(1) BB#2(1048575)

    176B	BB#1: derived from LLVM BB %exit2
    Predecessors according to CFG: BB#0

    192B	BB#2: derived from LLVM BB %return
    Predecessors according to CFG: BB#0
    208B		%vreg10<def> = SUBREG_TO_REG 0, %vreg6, 4; GR64:%vreg10 GR32:%vreg6
    224B		%RAX<def> = COPY %vreg10; GR64:%vreg10
    240B		RETQ %RAX<kill>

    # End machine code for function redundantadd.

    # *** IR Dump After Simple Register Coalescing ***:
    # Machine code for function redundantadd: Post SSA
    Function Live Ins: %RDI in %vreg2, %RSI in %vreg3

    0B	BB#0: derived from LLVM BB %entry
    Live Ins: %RDI %RSI
    16B		%vreg3<def> = COPY %RSI; GR64:%vreg3
    32B		%vreg2<def> = COPY %RDI; GR64:%vreg2
    48B		%vreg0<def> = MOV64rm %vreg2, 1, %noreg, 0, %noreg; mem:LD8[%a0] GR64_with_sub_8bit:%vreg0 GR64:%vreg2
    64B		%vreg10<def> = MOV64rm %vreg3, 1, %noreg, 0, %noreg; mem:LD8[%a1] GR64_with_sub_8bit:%vreg10 GR64:%vreg3
    128B		%vreg10:sub_32bit<def,tied1> = ADD32rr %vreg10:sub_32bit<tied0>, %vreg0:sub_32bit, %EFLAGS<imp-def>; GR64_with_sub_8bit:%vreg10,%vreg0
    144B		JNO_4 <BB#2>, %EFLAGS<imp-use,kill>
    160B		JMP_4 <BB#1>
    Successors according to CFG: BB#1(1) BB#2(1048575)

    176B	BB#1: derived from LLVM BB %exit2
    Predecessors according to CFG: BB#0

    192B	BB#2: derived from LLVM BB %return
    Predecessors according to CFG: BB#0
    224B		%RAX<def> = COPY %vreg10; GR64_with_sub_8bit:%vreg10
    240B		RETQ %RAX<kill>

    # End machine code for function redundantadd.

I also modified cse-add-with-overflow.ll to minimize the test.


  4. problem with CodeGen/X86/inline-asm-fpstack.ll:

    //--------   dump: -print-after-all:
    # *** IR Dump After Machine Loop Invariant Code Motion ***:
    # Machine code for function testPR4185b: Post SSA
    Constant Pool:
      cp#0: 1.000000e+06, align=4

    BB#0: derived from LLVM BB %return
    %FP0<def> = LD_Fp32m80 %noreg, 1, %noreg, <cp#0>, %noreg, %FPSW<imp-def,dead>; mem:LD4[ConstantPool]
    >>>>>>>%ST0<def> = COPY %FP0
    INLINEASM <es:fistl $0> [sideeffect] [attdialect], $0:[reguse], %ST0
    >>>>>>>%ST0<def> = COPY %FP0<kill>
    INLINEASM <es:fistpl $0> [sideeffect] [attdialect], $0:[reguse], %ST0, $1:[clobber], %ST0<earlyclobber,imp-def>
    RETL

    # End machine code for function testPR4185b.

    # *** IR Dump After Prologue/Epilogue Insertion & Frame Finalization ***:
    # Machine code for function testPR4185b: Post SSA
    Constant Pool:
      cp#0: 1.000000e+06, align=4

    BB#0: derived from LLVM BB %return
    LD_F32m %noreg, 1, %noreg, <cp#0>, %noreg, %FPSW<imp-def,dead>; mem:LD4[ConstantPool]
    INLINEASM <es:fistl $0> [sideeffect] [attdialect], $0:[reguse], %ST0
    INLINEASM <es:fistpl $0> [sideeffect] [attdialect], $0:[reguse], %ST0, $1:[clobber], %ST0<earlyclobber,imp-def>
    RETL

    # End machine code for function testPR4185b.

    //--------   dump using patch:
    # *** IR Dump After Machine Loop Invariant Code Motion ***:
    # Machine code for function testPR4185b: Post SSA
    Constant Pool:
      cp#0: 1.000000e+06, align=4

    BB#0: derived from LLVM BB %return
    %FP0<def> = LD_Fp32m80 %noreg, 1, %noreg, <cp#0>, %noreg, %FPSW<imp-def,dead>; mem:LD4[ConstantPool]
    >>>>>>>>>%ST0<def> = COPY %FP0<kill>
    INLINEASM <es:fistl $0> [sideeffect] [attdialect], $0:[reguse], %ST0
    INLINEASM <es:fistpl $0> [sideeffect] [attdialect], $0:[reguse], %ST0, $1:[clobber], %ST0<earlyclobber,imp-def>
    RETL

    # End machine code for function testPR4185b.

    # *** IR Dump After Prologue/Epilogue Insertion & Frame Finalization ***:
    # Machine code for function testPR4185b: Post SSA
    Constant Pool:
      cp#0: 1.000000e+06, align=4

    BB#0: derived from LLVM BB %return
    LD_F32m %noreg, 1, %noreg, <cp#0>, %noreg, %FPSW<imp-def,dead>; mem:LD4[ConstantPool]
    INLINEASM <es:fistl $0> [sideeffect] [attdialect], $0:[reguse], %ST0
    >>>>>>>>???ST_FPrr %ST0, %FPSW<imp-def>
    >>>>>>>>???LD_F0 %FPSW<imp-def>
    INLINEASM <es:fistpl $0> [sideeffect] [attdialect], $0:[reguse], %ST0, $1:[clobber], %ST0<earlyclobber,imp-def>
    RETL

    # End machine code for function testPR4185b.

I am still investigating why "Prologue/Epilogue Insertion" inserts
ST_FPrr %ST0, %FPSW<imp-def>
LD_F0 %FPSW<imp-def>
For now the patch provides the option -cse-ignore-copy with a corresponding FIXME.

Diff Detail

Event Timeline

Danil updated this revision to Diff 9907.May 28 2014, 11:33 PM
Danil retitled this revision from to CSE removes COPY..
Danil updated this object.
Danil edited the test plan for this revision. (Show Details)
Danil added a reviewer: atrick.
Danil added a subscriber: Unknown Object (MLST).
Danil updated this object.May 29 2014, 1:30 AM

Hi Danil,

  1. The reason why CSE ignored COPY is PerformTrivialCoalescing.

If a COPY is inserted into the hash table and PerformTrivialCoalescing then calls MI->eraseFromParent(),

the hash table is left holding a dangling entry.

That is not the only reason. With your patch, most of the singly used copies will be coalesced by this method. This method does not use the register coalescer's profitability model, so different pairs may end up coalesced.
The problem is that this may create live ranges that are over-constrained for register allocation (since they are longer) and may produce more spill code or expensive, uncoalescable copies.

At the very least, you will have to provide performance numbers for your patch.

That said, if we switch the default of cse-ignore-copies to true (instead of false as in your current patch), your patch shouldn’t change the current behavior AFAICT. (A diff of the llvm-testsuite binaries with and without your patch may be a good check.)
That way, the patch could be more easily tested by the community and you could already use it for your target.

How does that sound?

  2. cross-regclass copy:

The peephole optimizer has an optimization that rewrites cross-register bank copies. It does not know how to handle your case, but it could be extended to do that.

  3. subreg copy

Your patch does not handle the subreg copy per se, but the scheme does indeed seem natural. Maybe add a comment about that where we abort on subregs in PerformTrivialCoalescing.
BTW, you do not need to modify the test case.

  4. problem with CodeGen/X86/inline-asm-fpstack.ll:

This is another argument for disabling that optimization by default: until we understand what is going on, we do not know its impact.

Finally, please run clang-format. You have some weird spacing and at least one ^M character.

Thanks,
Quentin

atrick requested changes to this revision.May 29 2014, 3:36 PM
atrick edited edge metadata.

You're proposing removing cross-class copies, which is not necessarily a good idea. I expect that if we have a cross-class copy in MI, it's there for a reason. Note that we should remove cross-class copies like this:

r1 = a1
a2 = r1

I don't think we're handling the above case yet but we should fix that. Maybe as part of CSE or maybe earlier. Quentin, do you know if we have a bug on this?
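Folding that round-trip pattern amounts to simple copy forwarding. A minimal sketch (toy model with a hypothetical `propagateCopies` helper; it deliberately ignores intervening redefinitions, clobbers, and register-class legality, all of which a real pass must check):

```cpp
#include <cassert>
#include <map>
#include <string>
#include <utility>
#include <vector>

// A copy "dst = src". Forwarding sources through the chain
//   r1 = a1; a2 = r1
// rewrites the second copy to a2 = a1, leaving the intermediate
// cross-class copy of r1 dead (removable if r1 has no other uses).
using Copy = std::pair<std::string, std::string>; // {dst, src}

static std::vector<Copy> propagateCopies(std::vector<Copy> copies) {
  std::map<std::string, std::string> reach; // reg -> its ultimate source
  for (auto &c : copies) {
    auto it = reach.find(c.second);
    if (it != reach.end())
      c.second = it->second; // forward through the earlier copy
    reach[c.first] = c.second;
  }
  return copies;
}
```

Whether this belongs in MachineCSE, the peephole optimizer, or earlier is exactly the open question above; the sketch only shows the rewrite itself.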

Regarding subregister copies, I don't think this is a good fix, and I think the FIXME is still relevant. This fix happens to work when the use of the subreg copies gets CSE'd. But what if it doesn't? Then we have a subreg copy with multiple uses and a large live range. Subreg copies are conceptually part of the use's operand. We should really eliminate them completely, but if we can't, then they should be single-use copies with minimal live ranges so that they have the same liveness as they would have as part of the operand.

Quentin understands these issues better, so I'll defer to him. I would not oppose a subtarget option to allow CSE'ing copies until we have a better way to handle your cases (-machine-cse-copies). However, when the flag is disabled, the logic should remain as it was, and the FIXME should not be removed. Also, please make sure the cse-add-with-overflow.ll test still tests for subreg copy coalescing, not subreg copy CSE as you have done.

This revision now requires changes to proceed.May 29 2014, 3:36 PM