Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -283,6 +283,82 @@
       Segment = CurDAG->getRegister(0, MVT::i32);
   }
 
+  // Utility function to determine whether we should avoid selecting
+  // immediate forms of instructions, for better code size.
+  // At a high level, we'd like to avoid such instructions when
+  // the same constant is used multiple times within the same basic
+  // block and can be kept in a register instead.
+  //
+  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+    uint32_t UseCount = 0;
+
+    // Do not hoist if we're not optimizing for size.
+    // TODO: We'd like to remove this restriction.
+    // See the comment in X86InstrInfo.td for more info.
+    if (!OptForSize)
+      return false;
+
+    // Walk all the users of the immediate.
+    for (SDNode::use_iterator UI = N->use_begin(),
+         UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+      SDNode *User = *UI;
+
+      // This user is already selected. Count it as a legitimate use and
+      // move on.
+      if (User->isMachineOpcode()) {
+        UseCount++;
+        continue;
+      }
+
+      // We want to count stores of immediates as real uses.
+      if (User->getOpcode() == ISD::STORE &&
+          User->getOperand(1).getNode() == N) {
+        UseCount++;
+        continue;
+      }
+
+      // We don't currently match users that have more than 2 operands
+      // (except for stores, which are handled above). Those instructions
+      // won't match in ISel, for now, and would be counted incorrectly.
+      // This may change in the future as we add additional instruction
+      // types.
+      if (User->getNumOperands() != 2)
+        continue;
+
+      // Immediates that are used for offsets as part of stack
+      // manipulation should be left alone. These are typically
+      // used to indicate SP offsets for argument passing and
+      // will get pulled into stores/pushes (implicitly).
+      if (User->getOpcode() == X86ISD::ADD ||
+          User->getOpcode() == ISD::ADD    ||
+          User->getOpcode() == X86ISD::SUB ||
+          User->getOpcode() == ISD::SUB) {
+
+        // Find the other operand of the add/sub.
+        SDValue OtherOp = User->getOperand(0);
+        if (OtherOp.getNode() == N)
+          OtherOp = User->getOperand(1);
+
+        // Don't count if the other operand is SP.
+        RegisterSDNode *RegNode;
+        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+            (RegNode = dyn_cast_or_null<RegisterSDNode>(
+                 OtherOp->getOperand(1).getNode())))
+          if ((RegNode->getReg() == X86::ESP) ||
+              (RegNode->getReg() == X86::RSP))
+            continue;
+      }
+
+      // ... otherwise, count this and move on.
+      UseCount++;
+    }
+
+    // If we have more than one use, recommend hoisting.
+    return (UseCount > 1);
+  }
+
   /// getI8Imm - Return a target constant with the specified value, of type
   /// i8.
   inline SDValue getI8Imm(unsigned Imm, SDLoc DL) {
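Note on the UseCount > 1 cutoff above: with two or more users, materializing
the constant into a register once and reusing it should be smaller than
encoding the immediate into every instruction. A minimal standalone sketch of
that size math (not part of the patch; the byte counts are assumptions based
on typical 32-bit x86 encodings of a mov to an absolute address):

  // Approximate encoded sizes, in bytes, for 32-bit x86 code.
  #include <cstdio>

  int main() {
    const int MovMemImm = 10; // C7 /0: opcode + modrm + disp32 + imm32
    const int MovRegImm = 5;  // B8+rd: opcode + imm32
    const int MovMemReg = 6;  // 89 /r: opcode + modrm + disp32
    for (int Uses = 1; Uses <= 4; ++Uses) {
      int ImmForms = Uses * MovMemImm;             // immediate in every store
      int RegForm = MovRegImm + Uses * MovMemReg;  // materialize once, reuse
      std::printf("%d use(s): imm-forms=%dB reg-form=%dB\n",
                  Uses, ImmForms, RegForm);
    }
    return 0;
  }

Under these assumptions a single use favors the immediate form (10 vs. 11
bytes), which is why the heuristic only recommends hoisting at two uses.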
Index: llvm/trunk/lib/Target/X86/X86InstrArithmetic.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrArithmetic.td
+++ llvm/trunk/lib/Target/X86/X86InstrArithmetic.td
@@ -615,14 +615,14 @@
 def invalid_node : SDNode<"<>", SDTIntLeaf,[],"<>">;
 
-def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
-                       Imm8 , i8imm , imm,          i8imm , invalid_node,
-                       0, OpSizeFixed, 0>;
-def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
-                       Imm16, i16imm, imm,          i16i8imm, i16immSExt8,
-                       1, OpSize16, 0>;
-def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
-                       Imm32, i32imm, imm,          i32i8imm, i32immSExt8,
-                       1, OpSize32, 0>;
+def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
+                       Imm8 , i8imm , imm8_su,      i8imm , invalid_node,
+                       0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+                       Imm16, i16imm, imm16_su,     i16i8imm, i16immSExt8_su,
+                       1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+                       Imm32, i32imm, imm32_su,     i32i8imm, i32immSExt8_su,
+                       1, OpSize32, 0>;
 def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
                        Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
                        1, OpSizeFixed, 1>;
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td
 def i64immSExt8  : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
 
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges; however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO: This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2: This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow up on enabling O2 after we fix
+// that issue.
+// TODO3: This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 imm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
 def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
@@ -1283,13 +1317,13 @@
 let SchedRW = [WriteStore] in {
 def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
                    "mov{b}\t{$src, $dst|$dst, $src}",
-                   [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+                   [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
 def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
                    "mov{w}\t{$src, $dst|$dst, $src}",
-                   [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+                   [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
 def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
                    "mov{l}\t{$src, $dst|$dst, $src}",
-                   [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+                   [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
 def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                        "mov{q}\t{$src, $dst|$dst, $src}",
                        [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
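Every *_su ("single use") leaf above shares the same shape: the pattern may
consume the immediate operand directly only when hoisting is not recommended.
A standalone C++ mock of that inversion (hypothetical Node type and use list;
the real check is the SelectionDAG walk in X86ISelDAGToDAG.cpp above):

  #include <vector>

  // Hypothetical stand-in for an immediate SDNode with its list of users.
  struct Node {
    std::vector<Node *> Users;
  };

  // Simplified analogue of shouldAvoidImmediateInstFormsForSize: recommend
  // keeping the constant in a register once it has more than one real use.
  // (The real function also ignores SP adjustments and >2-operand users.)
  bool shouldAvoidImmediateInstForms(const Node &Imm, bool OptForSize) {
    return OptForSize && Imm.Users.size() > 1;
  }

  // The *_su predicate: match, i.e. allow folding the immediate into the
  // instruction, only when hoisting is NOT recommended.
  bool suLeafMatches(const Node &Imm, bool OptForSize) {
    return !shouldAvoidImmediateInstForms(Imm, OptForSize);
  }

  int main() {
    Node A, B;
    Node SingleUse{{&A}};    // one user: folds as an immediate
    Node MultiUse{{&A, &B}}; // two users under -Os: kept in a register
    return (suLeafMatches(SingleUse, true) && !suLeafMatches(MultiUse, true))
               ? 0 : 1;
  }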
Index: llvm/trunk/test/CodeGen/X86/immediate_merging.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/immediate_merging.ll
+++ llvm/trunk/test/CodeGen/X86/immediate_merging.ll
@@ -0,0 +1,82 @@
+; RUN: llc -o - -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@c = common global i32 0, align 4
+@e = common global i32 0, align 4
+@x = common global i32 0, align 4
+@f = common global i32 0, align 4
+@h = common global i32 0, align 4
+@i = common global i32 0, align 4
+
+; Test -Os to make sure immediates with multiple users don't get pulled into
+; instructions.
+define i32 @foo() optsize {
+; CHECK-LABEL: foo:
+; CHECK: movl $1234, [[R1:%[a-z]+]]
+; CHECK-NOT: movl $1234, a
+; CHECK-NOT: movl $1234, b
+; CHECK-NOT: movl $12, c
+; CHECK-NOT: cmpl $12, e
+; CHECK: movl [[R1]], a
+; CHECK: movl [[R1]], b
+
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+  store i32 12, i32* @c
+  %0 = load i32, i32* @e
+  %cmp = icmp eq i32 %0, 12
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @x
+  br label %if.end
+
+; New block. Make sure 1234 isn't live across basic blocks from before.
+; CHECK: movl $1234, f
+; CHECK: movl $555, [[R3:%[a-z]+]]
+; CHECK-NOT: movl $555, h
+; CHECK-NOT: addl $555, i
+; CHECK: movl [[R3]], h
+; CHECK: addl [[R3]], i
+
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1234, i32* @f
+  store i32 555, i32* @h
+  %1 = load i32, i32* @i
+  %add1 = add nsw i32 %1, 555
+  store i32 %add1, i32* @i
+  ret i32 0
+}
+
+; Test -O2 to make sure that all immediates get pulled into their users.
+define i32 @foo2() {
+; CHECK-LABEL: foo2:
+; CHECK: movl $1234, a
+; CHECK: movl $1234, b
+
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+
+  ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #1
+
+@AA = common global [100 x i8] zeroinitializer, align 1
+
+; memset gets lowered in DAG. Constant merging should hoist all the
+; immediates used to store to the individual memory locations. Make
+; sure we don't directly store the immediates.
+define void @foomemset() optsize {
+; CHECK-LABEL: foomemset:
+; CHECK-NOT: movl ${{.*}}, AA
+; CHECK: mov{{l|q}} %{{e|r}}ax, AA
+
+entry:
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i32 1, i1 false)
+  ret void
+}
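For reference, a hypothetical C++-level analogue of the first block of @foo
above; compiled with -Os, this is the shape of source the new CHECK lines
guard:

  // Expected under -Os (sketch, not checked compiler output):
  //   movl $1234, %eax   ; materialize the constant once
  //   movl %eax, a
  //   movl %eax, b       ; reuse the register, no second imm32 field
  int a, b;

  void foo() {
    a = 1234;
    b = 1234;
  }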
Index: llvm/trunk/test/CodeGen/X86/remat-invalid-liveness.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/remat-invalid-liveness.ll
+++ llvm/trunk/test/CodeGen/X86/remat-invalid-liveness.ll
@@ -1,85 +0,0 @@
-; RUN: llc %s -mcpu=core2 -o - | FileCheck %s
-; This test was failing while tracking the liveness in the register scavenger
-; during the branch folding pass. The allocation of the subregisters was
-; incorrect.
-; I.e., the faulty pattern looked like:
-; CH = movb 64
-; ECX = movl 3 <- CH was killed here.
-; CH = subb CH, ...
-;
-; This reduced test case triggers the crash before the fix, but does not
-; strictly speaking check that the resulting code is correct.
-; To check that the code is actually correct we would need to check the
-; liveness of the produced code.
-;
-; Currently, we check that after ECX = movl 3, we do not have subb CH,
-; whereas CH could have been redefined in between and that would have been
-; totally fine.
-;
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.9"
-
-%struct.A = type { %struct.B, %struct.C, %struct.D*, [1 x i8*] }
-%struct.B = type { i32, [4 x i8] }
-%struct.C = type { i128 }
-%struct.D = type { {}*, [0 x i32] }
-%union.E = type { i32 }
-
-; CHECK-LABEL: __XXX1:
-; CHECK: movl $3, %ecx
-; CHECK-NOT: subb %{{[a-z]+}}, %ch
-; Function Attrs: nounwind optsize ssp
-define fastcc void @__XXX1(%struct.A* %ht) #0 {
-entry:
-  %const72 = bitcast i128 72 to i128
-  %const3 = bitcast i128 3 to i128
-  switch i32 undef, label %if.end196 [
-    i32 1, label %sw.bb.i
-    i32 3, label %sw.bb2.i
-  ]
-
-sw.bb.i:                                          ; preds = %entry
-  %call.i.i.i = tail call i32 undef(%struct.A* %ht, i8 zeroext 22, i32 undef, i32 0, %struct.D* undef)
-  %bf.load.i.i = load i128, i128* undef, align 4
-  %bf.lshr.i.i = lshr i128 %bf.load.i.i, %const72
-  %shl1.i.i = shl nuw nsw i128 %bf.lshr.i.i, 8
-  %shl.i.i = trunc i128 %shl1.i.i to i32
-  br i1 undef, label %cond.false10.i.i, label %__XXX2.exit.i.i
-
-__XXX2.exit.i.i:                                  ; preds = %sw.bb.i
-  %extract11.i.i.i = lshr i128 %bf.load.i.i, %const3
-  %extract.t12.i.i.i = trunc i128 %extract11.i.i.i to i32
-  %bf.cast7.i.i.i = and i32 %extract.t12.i.i.i, 3
-  %arrayidx.i.i.i = getelementptr inbounds %struct.A, %struct.A* %ht, i32 0, i32 3, i32 %bf.cast7.i.i.i
-  br label %cond.end12.i.i
-
-cond.false10.i.i:                                 ; preds = %sw.bb.i
-  %arrayidx.i6.i.i = getelementptr inbounds %struct.A, %struct.A* %ht, i32 0, i32 3, i32 0
-  br label %cond.end12.i.i
-
-cond.end12.i.i:                                   ; preds = %cond.false10.i.i, %__XXX2.exit.i.i
-  %.sink.in.i.i = phi i8** [ %arrayidx.i.i.i, %__XXX2.exit.i.i ], [ %arrayidx.i6.i.i, %cond.false10.i.i ]
-  %.sink.i.i = load i8*, i8** %.sink.in.i.i, align 4
-  %tmp = bitcast i8* %.sink.i.i to %union.E*
-  br i1 undef, label %for.body.i.i, label %if.end196
-
-for.body.i.i:                                     ; preds = %for.body.i.i, %cond.end12.i.i
-  %weak.i.i = getelementptr inbounds %union.E, %union.E* %tmp, i32 undef, i32 0
-  %tmp1 = load i32, i32* %weak.i.i, align 4
-  %cmp36.i.i = icmp ne i32 %tmp1, %shl.i.i
-  %or.cond = and i1 %cmp36.i.i, false
-  br i1 %or.cond, label %for.body.i.i, label %if.end196
-
-sw.bb2.i:                                         ; preds = %entry
-  %bf.lshr.i85.i = lshr i128 undef, %const72
-  br i1 undef, label %if.end196, label %__XXX2.exit.i95.i
-
-__XXX2.exit.i95.i:                                ; preds = %sw.bb2.i
-  %extract11.i.i91.i = lshr i128 undef, %const3
-  br label %if.end196
-
-if.end196:                                        ; preds = %__XXX2.exit.i95.i, %sw.bb2.i, %for.body.i.i, %cond.end12.i.i, %entry
-  ret void
-}
-
-attributes #0 = { nounwind optsize ssp "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }