Index: include/llvm/Transforms/Utils/CodeExtractor.h =================================================================== --- include/llvm/Transforms/Utils/CodeExtractor.h +++ include/llvm/Transforms/Utils/CodeExtractor.h @@ -25,6 +25,7 @@ class BranchProbabilityInfo; class DominatorTree; class Function; + class Instruction; class Loop; class Module; class RegionNode; @@ -103,7 +104,8 @@ /// a code sequence, that sequence is modified, including changing these /// sets, before extraction occurs. These modifications won't have any /// significant impact on the cost however. - void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs) const; + void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, + ValueSet &Allocas) const; private: void severSplitPHINodes(BasicBlock *&Header); Index: lib/Transforms/Utils/CodeExtractor.cpp =================================================================== --- lib/Transforms/Utils/CodeExtractor.cpp +++ lib/Transforms/Utils/CodeExtractor.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" @@ -141,16 +142,67 @@ return false; } -void CodeExtractor::findInputsOutputs(ValueSet &Inputs, - ValueSet &Outputs) const { +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, + ValueSet &SinkCands) const { + for (BasicBlock *BB : Blocks) { + for (Instruction &II : *BB) { + IntrinsicInst *LifeStart = dyn_cast(&II); + if (!LifeStart) + continue; + if (LifeStart->getIntrinsicID() != Intrinsic::lifetime_start) + continue; + Value *LocalWithBitcast = LifeStart->getOperand(1); + // The matching lifetime_end must also be in the region: + bool LifeEndFound = false; + for (User *U : LocalWithBitcast->users()) { + // Found unknown uses outside the region, bail. + if (!definedInRegion(Blocks, U)) { + LifeEndFound = false; + break; + } + IntrinsicInst *LifeEnd = dyn_cast(U); + if (LifeEnd && LifeEnd->getIntrinsicID() == Intrinsic::lifetime_end) + LifeEndFound = true; + } + if (!LifeEndFound) + continue; + + Value *Local = LocalWithBitcast->stripPointerCasts(); + auto *AI = dyn_cast(Local); + if (!AI) + continue; + + if (definedInRegion(Blocks, AI)) + continue; + + bool FoundUnknownUse = false; + if (AI != LocalWithBitcast) + for (User *U : AI->users()) { + if (U != LocalWithBitcast && !definedInRegion(Blocks, U)) { + FoundUnknownUse = true; + break; + } + } + + if (FoundUnknownUse) + continue; + + if (LocalWithBitcast != AI && !definedInRegion(Blocks, LocalWithBitcast)) + SinkCands.insert(LocalWithBitcast); + SinkCands.insert(AI); + } + } + for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) - if (definedInCaller(Blocks, *OI)) - Inputs.insert(*OI); + ++OI) { + Value *V = *OI; + if (!SinkCands.count(V) && definedInCaller(Blocks, V)) + Inputs.insert(V); + } for (User *U : II.users()) if (!definedInRegion(Blocks, U)) { @@ -718,7 +770,7 @@ if (!isEligible()) return nullptr; - ValueSet inputs, outputs; + ValueSet inputs, outputs, SinkingCands; // Assumption: this is a single-entry code region, and the header is the first // block in the region. @@ -758,7 +810,12 @@ newFuncRoot->getInstList().push_back(BranchInst::Create(header)); // Find inputs to, outputs from the code region. - findInputsOutputs(inputs, outputs); + findInputsOutputs(inputs, outputs, SinkingCands); + + // Now sink all instructions which only have non-phi uses inside the region + for (auto *II : SinkingCands) + dyn_cast(II)->moveBefore(*header, + header->getFirstInsertionPt()); // Calculate the exit blocks for the extracted region and the total exit // weights for each of those blocks. Index: test/Transforms/CodeExtractor/PartialInlineAlloca.ll =================================================================== --- test/Transforms/CodeExtractor/PartialInlineAlloca.ll +++ test/Transforms/CodeExtractor/PartialInlineAlloca.ll @@ -0,0 +1,68 @@ + +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + ; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind uwtable +define i32 @callee_sinkable_bitcast(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_sinkable_bitcast.{{[0-9]}} +; CHECK: alloca +; CHECK-NEXT: bitcast +; CHECK-NEXT: call void @llvm.lifetime +bb: + %tmp = alloca %"class.base", align 4 + %tmp1 = bitcast %"class.base"* %tmp to i8* + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2 + %tmp11 = bitcast %"class.base"* %tmp to i32* + store i32 %tmp3, i32* %tmp11, align 4, !tbaa !2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + ret i32 %tmp7 +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_sinkable_bitcast(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + Index: test/Transforms/CodeExtractor/PartialInlineAlloca2.ll =================================================================== --- test/Transforms/CodeExtractor/PartialInlineAlloca2.ll +++ test/Transforms/CodeExtractor/PartialInlineAlloca2.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_no_bitcast(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_no_bitcast.{{[0-9]}} +; CHECK: alloca +; CHECK-NEXT: call void @llvm.lifetime +bb: + %tmp = alloca i8, align 4 + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + %tmp11 = bitcast i8 * %tmp to i32* + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + ret i32 %tmp7 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_no_bitcast(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + + Index: test/Transforms/CodeExtractor/PartialInlineAlloca4.ll =================================================================== --- test/Transforms/CodeExtractor/PartialInlineAlloca4.ll +++ test/Transforms/CodeExtractor/PartialInlineAlloca4.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_unknown_use1(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_unknown_use1.{{[0-9]}} +; CHECK-NOT: alloca +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca i8, align 4 + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + %tmp11 = bitcast i8* %tmp to i32* + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + %tmp1 = bitcast i8* %tmp to i32* + ret i32 %tmp7 +} + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_unknown_use1(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + + Index: test/Transforms/CodeExtractor/PartialInlineAlloca5.ll =================================================================== --- test/Transforms/CodeExtractor/PartialInlineAlloca5.ll +++ test/Transforms/CodeExtractor/PartialInlineAlloca5.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_unknown_use2(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_unknown_use2.{{[0-9]}} +; CHECK-NOT: alloca +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca i32, align 4 + %tmp1 = bitcast i32* %tmp to i8* + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2 + store i32 %tmp3, i32* %tmp, align 4, !tbaa !2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + call void @bar(i32* nonnull %tmp) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + %tmp10 = bitcast i8* %tmp1 to i32* + ret i32 %tmp7 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_unknown_use2(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + +