Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -1760,6 +1760,18 @@
       }
     }
 
+  // If we have a cold call site, try to sink the addressing computation into
+  // the cold block. This interacts with our handling for loads and stores to
+  // ensure that we can fold a potential addressing computation into all of
+  // its uses. TODO: generalize this to work over profiling data.
+  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+    for (auto &Arg : CI->arg_operands()) {
+      if (!Arg->getType()->isPointerTy())
+        continue;
+      unsigned AS = Arg->getType()->getPointerAddressSpace();
+      return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+    }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -3443,6 +3455,8 @@
   if (!MightBeFoldableInst(I))
     return true;
 
+  const bool OptSize = I->getFunction()->optForSize();
+
   // Loop over all the uses, recursively processing them.
   for (Use &U : I->uses()) {
     Instruction *UserI = cast<Instruction>(U.getUser());
@@ -3460,6 +3474,11 @@
     }
 
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+      // If this is a cold call, we can sink the addressing calculation into
+      // the cold path. See optimizeCallInst.
+      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+        continue;
+
       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
       if (!IA) return true;
 
@@ -3551,10 +3570,10 @@
   if (!BaseReg && !ScaledReg)
     return true;
 
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction.  If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
+  // If all uses of this instruction can have the address mode sunk into them,
+  // we can remove the addressing mode and effectively trade one live register
+  // for another (at worst). In this context, folding an addressing mode into
+  // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
@@ -3562,8 +3581,13 @@
 
   // Now that we know that all uses of this instruction are part of a chain of
   // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
+  // into a memory use, loop over each of these memory operation uses and see
+  // if they could *actually* fold the instruction. The assumption is that
+  // addressing modes are cheap and that duplicating the computation involved
+  // many times is worthwhile, even on a fast path. For sinking candidates
+  // (i.e. cold call sites), this serves as a way to prevent excessive code
+  // growth, since most architectures have some reasonably small and fast way
+  // to compute an effective address (e.g. LEA on x86).
   SmallVector<Instruction*, 16> MatchedAddrModeInsts;
   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
     Instruction *User = MemoryUses[i].first;
@@ -3617,6 +3641,11 @@
   return false;
 }
 
+/// Sink the addressing mode computation immediately before MemoryInst if doing
+/// so can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all-or-nothing
+/// decision for all uses of the same addressing computation.
+///
 /// Load and Store Instructions often have addressing modes that can do
 /// significant amounts of computation. As such, instruction selection will try
 /// to get the load or store to do as much computation as possible for the
@@ -3624,7 +3653,13 @@
 /// such, we sink as much legal addressing mode work into the block as possible.
 ///
 /// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above) and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                                         Type *AccessTy, unsigned AddrSpace) {
   Value *Repl = Addr;
@@ -3663,7 +3698,9 @@
       continue;
     }
 
-    // For non-PHIs, determine the addressing mode being computed.
+    // For non-PHIs, determine the addressing mode being computed. Note that
+    // the result may differ depending on what other uses our candidate
+    // addressing instructions might have.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
       V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
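To illustrate the intent of the optimizeCallInst change above, here is a rough before/after sketch in IR. The value and block names (%p, %c, @error, cold.path) are illustrative only and do not come from the patch, and the exact instruction sequence the pass emits for the sunk address may vary; it is roughly a ptrtoint/add/inttoptr chain.

  ; before: the address is computed on the common path, but only the cold
  ; call uses it, so it stays live across the branch
  entry:
    %p = getelementptr inbounds i64, i64* %base, i64 5
    br i1 %c, label %cold.path, label %exit
  cold.path:
    call void @error(i64* %p) cold
    br label %exit

  ; after: the computation is rematerialized next to the cold call and the
  ; original getelementptr in entry becomes dead
  cold.path:
    %p.base = ptrtoint i64* %base to i64
    %p.off = add i64 %p.base, 40               ; 5 * 8 bytes
    %p.sunk = inttoptr i64 %p.off to i64*
    call void @error(i64* %p.sunk) cold
    br label %exit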
Index: llvm/trunk/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
===================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -0,0 +1,196 @@
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink a single addressing mode computation to its use?
+define void @test1(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test1
+; CHECK: add i64 {{.+}}, 40
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %v = load i32, i32* %casted, align 4
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
+declare void @foo(i32)
+
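The constant 40 in the CHECK lines is simply the byte offset of the getelementptr (index 5 into i64, i.e. 5 * 8 bytes). When CodeGenPrepare sinks the address into if.then, it typically rewrites it as an integer add of that offset next to the load. A sketch of the expected if.then for @test1, with illustrative value names rather than the pass's actual ones:

  if.then:
    %sunkaddr = ptrtoint i64* %base to i64
    %sunkaddr1 = add i64 %sunkaddr, 40
    %sunkaddr2 = inttoptr i64 %sunkaddr1 to i32*
    %v = load i32, i32* %sunkaddr2, align 4
    br label %fallthrough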
+; Make sure sinking two copies of the addressing mode into different blocks works.
+define void @test2(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test2
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %next, label %fallthrough
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
+; If we have two loads in the same block, we only need one copy of the addressing
+; mode - instruction selection will duplicate it if needed.
+define void @test3(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test3
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+; CHECK-NOT: add i64 {{.+}}, 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  br label %fallthrough
+
+fallthrough:
+  ret void
+}
+
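The remaining tests attach coldness directly at the call site (the trailing cold after the argument list). Since the C++ change queries CI->hasFnAttr(Attribute::Cold), a cold attribute placed on the callee declaration should presumably be picked up as well, though only the call-site form is exercised below. A sketch of the two spellings, reusing the test's @slowpath for illustration:

  call void @slowpath(i32 %v1, i32* %casted) cold   ; call-site attribute (used below)
  declare void @slowpath(i32, i32*) cold            ; attribute on the callee (not used here)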
+; Can we still sink the addressing mode if there's a cold use of the
+; address itself?
+define void @test4(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test4
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
+
+; Negative test - we don't want to duplicate the addressing into the hot path.
+define void @test5(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test5
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
+  br label %fallthrough
+}
+
+; Negative test - optimizing for size.
+define void @test6(i1 %cond, i64* %base) minsize {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
+
+
+; Make sure sinking two copies of the addressing mode into different blocks works
+; when there are cold paths for each.
+define void @test7(i1 %cond, i64* %base) {
+; CHECK-LABEL: @test7
+entry:
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: add i64 {{.+}}, 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: add i64 {{.+}}, 40
+  %v2 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v2)
+  %cmp2 = icmp eq i32 %v2, 0
+  br i1 %cmp2, label %rare.2, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+; CHECK-LABEL: rare.1:
+; CHECK: add i64 {{.+}}, 40
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %next
+
+rare.2:
+; CHECK-LABEL: rare.2:
+; CHECK: add i64 {{.+}}, 40
+  call void @slowpath(i32 %v2, i32* %casted) cold
+  br label %fallthrough
+}
+
+
+declare void @slowpath(i32, i32*)
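To reproduce the RUN line outside of lit, the invocation from a build tree is roughly:

  bin/opt -S -codegenprepare < llvm/trunk/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll \
    | bin/FileCheck llvm/trunk/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll

or the single test can be run through lit itself, e.g. bin/llvm-lit <path-to>/sink-addrmode.ll. Paths here assume an llvm/trunk checkout alongside the build directory; adjust to the local layout.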