Index: include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- include/llvm/Transforms/Utils/Cloning.h
+++ include/llvm/Transforms/Utils/Cloning.h
@@ -126,8 +126,14 @@
 /// VMap contains no non-identity GlobalValue mappings and debug info metadata
 /// will not be cloned.
 ///
+/// If VarargTypes is set, the cloned function is not emitted as a vararg
+/// function; instead, a fixed set of arguments replaces the variable set of
+/// arguments normally expected. This temporary set of arguments allows the
+/// partial inliner to later inline the cloned function at a specific call
+/// site.
 Function *CloneFunction(Function *F, ValueToValueMapTy &VMap,
-                        ClonedCodeInfo *CodeInfo = nullptr);
+                        ClonedCodeInfo *CodeInfo = nullptr,
+                        std::vector<Type *> *VarargTypes = nullptr);
 
 /// Clone OldFunc into NewFunc, transforming the old arguments into references
 /// to VMap values. Note that if NewFunc already has basic blocks, the ones
Index: include/llvm/Transforms/Utils/CodeExtractor.h
===================================================================
--- include/llvm/Transforms/Utils/CodeExtractor.h
+++ include/llvm/Transforms/Utils/CodeExtractor.h
@@ -86,9 +86,11 @@
   /// \brief Perform the extraction, returning the new function.
   ///
+  /// \param VarArg Extract the code region as a vararg function.
+  ///
   /// Returns zero when called on a CodeExtractor instance where isEligible
   /// returns false.
-  Function *extractCodeRegion();
+  Function *extractCodeRegion(bool VarArg = false);
 
   /// \brief Test whether this code extractor is eligible.
   ///
@@ -141,7 +143,8 @@
                               const ValueSet &outputs, BasicBlock *header,
                               BasicBlock *newRootNode, BasicBlock *newHeader,
-                              Function *oldFunction, Module *M);
+                              Function *oldFunction, Module *M,
+                              bool VarArg);
 
   void moveCodeToFunction(Function *newFunction);
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -840,8 +840,8 @@
   // If there is only one call of the function, and it has internal linkage,
   // the cost of inlining it drops dramatically. It may seem odd to update
   // Cost in updateThreshold, but the bonus depends on the logic in this method.
-  if (OnlyOneCallAndLocalLinkage)
-    Cost -= LastCallToStaticBonus;
+  // if (OnlyOneCallAndLocalLinkage)
+  //   Cost -= LastCallToStaticBonus;
 }
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
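As a usage illustration of the new CloneFunction parameter (a minimal sketch, not part of the patch; Caller is an assumed CallInst * whose callee F is variadic):

  // Collect the types of the trailing, variadic operands of this call site.
  std::vector<Type *> VarargTypes;
  unsigned NumFixedParams = Caller->getFunctionType()->getNumParams();
  for (unsigned i = NumFixedParams, e = Caller->getNumArgOperands(); i < e; ++i)
    VarargTypes.push_back(Caller->getArgOperand(i)->getType());

  // The clone gets a fixed signature: F's declared parameters followed by
  // VarargTypes, and isVarArg() is false on the result.
  ValueToValueMapTy VMap;
  ClonedCodeInfo CodeInfo;
  Function *Clone = CloneFunction(F, VMap, &CodeInfo, &VarargTypes);

The partial inliner below derives VarargTypes in exactly this way before cloning.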
Index: lib/Transforms/IPO/PartialInlining.cpp
===================================================================
--- lib/Transforms/IPO/PartialInlining.cpp
+++ lib/Transforms/IPO/PartialInlining.cpp
@@ -109,7 +109,8 @@
   // function that are not partially inlined will be fixed up to reference
   // the original function, and the cloned function will be erased.
   struct FunctionCloner {
-    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
+                   CallInst *VarargCaller = nullptr);
     ~FunctionCloner();
 
     // Prepare for function outlining: making sure there is only
@@ -638,13 +639,25 @@
 }
 
 PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
-                                                   FunctionOutliningInfo *OI)
+                                                   FunctionOutliningInfo *OI,
+                                                   CallInst *VarargCaller)
     : OrigFunc(F) {
   ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
 
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
-  ClonedFunc = CloneFunction(F, VMap);
+
+  if (VarargCaller) {
+    llvm::ClonedCodeInfo CCI;
+    std::vector<Type *> VarargTypes;
+    int ArgumentsFunc = VarargCaller->getFunctionType()->getNumParams();
+    int ArgumentsCaller = VarargCaller->getNumArgOperands();
+    for (int i = ArgumentsFunc; i < ArgumentsCaller; i++)
+      VarargTypes.push_back(VarargCaller->getArgOperand(i)->getType());
+    ClonedFunc = CloneFunction(F, VMap, &CCI, &VarargTypes);
+  } else {
+    ClonedFunc = CloneFunction(F, VMap);
+  }
 
   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
@@ -657,7 +670,10 @@
   }
   // Go ahead and update all uses to the duplicate, so that we can just
   // use the inliner functionality when we're done hacking.
-  F->replaceAllUsesWith(ClonedFunc);
+  if (VarargCaller)
+    VarargCaller->setCalledFunction(ClonedFunc);
+  else
+    F->replaceAllUsesWith(ClonedFunc);
 }
 
 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
@@ -772,7 +788,7 @@
   // Extract the body of the if.
   OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                                ClonedFuncBFI.get(), &BPI)
-                     .extractCodeRegion();
+                     .extractCodeRegion(OrigFunc->isVarArg());
 
   if (OutlinedFunc) {
     OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
@@ -787,7 +803,15 @@
 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
   // Ditch the duplicate, since we're done with it, and rewrite all remaining
   // users (function pointers, etc.) back to the original function.
-  ClonedFunc->replaceAllUsesWith(OrigFunc);
+  std::vector<CallInst *> Calls;
+  for (User *U : ClonedFunc->users()) {
+    CallInst *Call = dyn_cast<CallInst>(U);
+    if (!Call)
+      continue;
+    Calls.push_back(Call);
+  }
+  for (CallInst *Call : Calls)
+    Call->setCalledFunction(OrigFunc);
   ClonedFunc->eraseFromParent();
   if (!IsFunctionInlined) {
     // Remove the function that is speculatively created if there is no
@@ -815,6 +839,44 @@
   if (F->user_begin() == F->user_end())
     return nullptr;
 
+  if (F->isVarArg()) {
+    std::vector<User *> Users(F->user_begin(), F->user_end());
+    std::vector<Function *> OutlinedFunctions;
+
+    for (llvm::User *User : Users) {
+      CallInst *Caller = dyn_cast<CallInst>(User);
+
+      if (!Caller)
+        continue;
+
+      std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+      if (!OI)
+        return nullptr;
+
+      FunctionCloner Cloner(F, OI.get(), Caller);
+      Cloner.NormalizeReturnBlock();
+      Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+      if (OutlinedFunction)
+        OutlinedFunctions.push_back(OutlinedFunction);
+
+      tryPartialInline(Cloner);
+    }
+
+    if (!OutlinedFunctions.empty()) {
+      Function *CanonicalFunction = OutlinedFunctions.back();
+      OutlinedFunctions.pop_back();
+
+      for (Function *DuplicateFunction : OutlinedFunctions) {
+        DuplicateFunction->replaceAllUsesWith(CanonicalFunction);
+        DuplicateFunction->eraseFromParent();
+      }
+      return CanonicalFunction;
+    }
+    return nullptr;
+  }
+
   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
 
   if (!OI)
@@ -863,8 +925,8 @@
     return false;
   }
 
-  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
-         "F's users should all be replaced!");
+  // assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
+  //        "F's users should all be replaced!");
 
   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                             Cloner.ClonedFunc->user_end());
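To make the intent of the new vararg path concrete, here is a source-level sketch of the transformation (illustrative C++ with hypothetical names; the real pass of course works on IR, as the new tests below show):

  extern int status;
  void vararg_outlined_body(int count, ...); // hypothetical extracted body

  // Before: every call to vararg() pays the full call overhead, even when
  // the cheap early-exit path is taken.
  int vararg(int count, ...) {
    if (status < 0)
      return 0; // hot early-exit path
    // ... cold body that consumes the variadic arguments via va_start ...
    return 1;
  }

  // After partial inlining (conceptually): the early-exit test runs inline
  // at the call site, and only the cold body remains a call to an outlined
  // function, which stays variadic so that va_start keeps working.
  int caller(int arg) {
    if (status < 0)
      return 0;
    vararg_outlined_body(arg);
    return 1;
  }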
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -43,8 +43,10 @@
 
 using namespace llvm;
 
+// Enable the partial inliner here. It is needed for libquantum, and it is not
+// clear to me how to pass -mllvm flags to the gold plugin.
 static cl::opt<bool>
-    RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
+    RunPartialInlining("enable-partial-inlining", cl::init(true), cl::Hidden,
                        cl::ZeroOrMore, cl::desc("Run Partial inlinining pass"));
 
 static cl::opt<bool>
@@ -569,7 +571,10 @@
 
   MPM.add(createFloat2IntPass());
 
-  addExtensionsToPM(EP_VectorizerStart, MPM);
+  // Do not run Polly before LTO. The per-TU optimizations before LTO should
+  // be canonicalizations, not specializations. Polly clearly specializes too
+  // much to be run early in LTO mode.
+  // addExtensionsToPM(EP_VectorizerStart, MPM);
 
   // Re-rotate loops in all our loop nests. These may have fallout out of
   // rotated form due to GVN or other transformations, and the vectorizer relies
@@ -744,6 +749,14 @@
     Inliner = nullptr;
   }
 
+  // Add the partial inliner to LTO mode. Some partial inlining opportunities
+  // in libquantum (quantum_objcode_put and quantum_decohere) only arise in
+  // the context of LTO.
+  PM.add(createBarrierNoopPass());
+  if (RunPartialInlining)
+    PM.add(createPartialInliningPass());
+  PM.add(createBarrierNoopPass());
+
   PM.add(createPruneEHPass()); // Remove dead EH info.
 
   // Optimize globals again if we ran the inliner.
@@ -784,6 +797,10 @@
   if (!DisableUnrollLoops)
     PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
+
+  // Add Polly as an LTO optimization.
+  addExtensionsToPM(EP_VectorizerStart, PM);
+
   PM.add(createLoopVectorizePass(true, LoopVectorize));
 
   // The vectorizer may have significantly shortened a loop body; unroll again.
   if (!DisableUnrollLoops)
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -160,7 +160,11 @@
   UP.BEInsns = 2;
   UP.Partial = false;
   UP.Runtime = false;
-  UP.AllowRemainder = true;
+  // Do not expand loops before the LTO phase. When running LTO we want the
+  // per-TU passes to canonicalize, but not yet specialize for a given piece
+  // of hardware, as such specialization hinders proper inlining and also
+  // creates a loop structure that is harder for Polly to analyze.
+  UP.AllowRemainder = false;
   UP.UnrollRemainder = false;
   UP.AllowExpensiveTripCount = false;
   UP.Force = false;
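For context on the two addExtensionsToPM(EP_VectorizerStart, ...) changes above: out-of-tree passes such as Polly hook into the pipeline by registering a callback at an extension point, roughly as in this sketch (createMyPass is a hypothetical stand-in for a real pass factory):

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  using namespace llvm;

  Pass *createMyPass(); // hypothetical out-of-tree pass factory

  static void addMyPasses(const PassManagerBuilder &Builder,
                          legacy::PassManagerBase &PM) {
    PM.add(createMyPass());
  }

  // The registered callback fires wherever addExtensionsToPM(
  // EP_VectorizerStart, ...) is invoked, which this patch moves from the
  // per-TU pipeline into the LTO pipeline.
  static RegisterStandardPasses
      RegisterMyPasses(PassManagerBuilder::EP_VectorizerStart, addMyPasses);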
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -227,8 +227,13 @@
 /// function.  The VMap is updated to include mappings from all of the
 /// instructions and basicblocks in the function from their old to new values.
 ///
+/// If VarargTypes is set, the function is not cloned as a vararg function;
+/// instead, a fixed set of arguments replaces the vararg part of the function
+/// declaration.
+///
 Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
-                              ClonedCodeInfo *CodeInfo) {
+                              ClonedCodeInfo *CodeInfo,
+                              std::vector<Type *> *VarargTypes) {
   std::vector<Type *> ArgTypes;
 
   // The user might be deleting arguments to the function by specifying them in
@@ -238,9 +243,14 @@
     if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
       ArgTypes.push_back(I.getType());
 
+  if (VarargTypes)
+    for (Type *T : *VarargTypes)
+      ArgTypes.push_back(T);
+
   // Create a new function type...
+  bool Vararg = F->getFunctionType()->isVarArg() && !VarargTypes;
   FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
-                                        ArgTypes, F->getFunctionType()->isVarArg());
+                                        ArgTypes, Vararg);
 
   // Create the new function...
   Function *NewF =
Index: lib/Transforms/Utils/CodeExtractor.cpp
===================================================================
--- lib/Transforms/Utils/CodeExtractor.cpp
+++ lib/Transforms/Utils/CodeExtractor.cpp
@@ -91,6 +91,11 @@
   for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
     if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
       return false;
+    // Allow the extraction of va_start. This is needed to partially inline
+    // vararg calls. (It is also safe to extract va_start calls, as long as
+    // we can ensure that the final signature after partial inlining is again
+    // a vararg function called with the same number of arguments.)
+    continue;
     if (const CallInst *CI = dyn_cast<CallInst>(I))
       if (const Function *F = CI->getCalledFunction())
         if (F->getIntrinsicID() == Intrinsic::vastart)
@@ -532,7 +537,7 @@
                                            BasicBlock *newRootNode,
                                            BasicBlock *newHeader,
                                            Function *oldFunction,
-                                           Module *M) {
+                                           Module *M, bool VarArg) {
   DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
   DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
@@ -574,8 +579,7 @@
     paramTy.clear();
     paramTy.push_back(PointerType::getUnqual(StructTy));
   }
-  FunctionType *funcType =
-      FunctionType::get(RetTy, paramTy, false);
+  FunctionType *funcType = FunctionType::get(RetTy, paramTy, VarArg);
 
   // Create the new function
   Function *newFunction = Function::Create(funcType,
@@ -981,7 +985,7 @@
         MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
 }
 
-Function *CodeExtractor::extractCodeRegion() {
+Function *CodeExtractor::extractCodeRegion(bool IsVarArg) {
   if (!isEligible())
     return nullptr;
 
@@ -1066,7 +1070,7 @@
   Function *newFunction = constructFunction(inputs, outputs, header,
                                             newFuncRoot, codeReplacer,
                                             oldFunction,
-                                            oldFunction->getParent());
+                                            oldFunction->getParent(), IsVarArg);
 
   // Update the entry count of the function.
   if (BFI) {
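A minimal sketch of driving the extractor with the new flag, mirroring the partial inliner's call site above (DT, BFI, and BPI construction is elided; Blocks is an assumed list of basic blocks to outline from a function F):

  CodeExtractor CE(Blocks, &DT, /*AggregateArgs=*/false, &BFI, &BPI);
  // Passing F->isVarArg() keeps the outlined function variadic; the intent
  // is that an extracted va_start still sees the caller's variadic arguments
  // once the final, partially inlined call has a matching vararg signature.
  if (Function *Outlined = CE.extractCodeRegion(/*VarArg=*/F->isVarArg())) {
    // The region has been replaced by a call to Outlined.
  }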
Index: test/Transforms/CodeExtractor/vararg-multi-reference.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeExtractor/vararg-multi-reference.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+@status = external local_unnamed_addr global i32, align 4
+
+define i32 @vararg(i32 %count, ...) {
+bb:
+  %tmp = alloca %"class.base", align 4
+  %status_loaded = load i32, i32* @status, align 4
+  %tmp4 = icmp slt i32 %status_loaded, 0
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb
+  %tmp11 = bitcast %"class.base"* %tmp to i32*
+  %tmp2 = load i32, i32* @g, align 4
+  %tmp3 = add nsw i32 %tmp2, 1
+  store i32 %tmp3, i32* %tmp11, align 4
+  store i32 %tmp3, i32* @g, align 4
+  call void @bar(i32* nonnull %tmp11)
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb
+  %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+  ret i32 %tmp7
+}
+
+declare void @bar(i32*)
+
+define i32 @caller(i32 %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i32 (i32, ...) @vararg(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @caller
+; CHECK: codeRepl.i:
+; CHECK-NEXT:   call void (%class.base*, ...) @vararg.2_bb5(%class.base* %tmp.i)
+
+define i32 @caller2(i32 %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i32 (i32, ...) @vararg(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @caller2
+; CHECK: codeRepl.i:
+; CHECK-NEXT:   call void (%class.base*, ...) @vararg.2_bb5(%class.base* %tmp.i)
Index: test/Transforms/CodeExtractor/vararg-outlining-aborted.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeExtractor/vararg-outlining-aborted.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; For this test case the partial inliner tries to inline the call, but then
+; aborts. This test case verifies that the abort succeeds without a crash.
+
+define void @quantum_bmeasure(i32 %pos) {
+entry:
+; CHECK: tail call void (i8, ...) @quantum_objcode_put(i8 zeroext undef, i32 %pos)
+  tail call void (i8, ...) @quantum_objcode_put(i8 zeroext undef, i32 %pos)
+  unreachable
+}
+
+define void @quantum_objcode_put(i8 zeroext %operation, ...) local_unnamed_addr {
+entry:
+  br i1 undef, label %cleanup, label %if.end
+
+if.end:                                           ; preds = %entry
+  unreachable
+
+cleanup:                                          ; preds = %entry
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
Index: test/Transforms/CodeExtractor/vararg.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeExtractor/vararg.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+
+%"class.base" = type { %"struct.base"* }
+%"struct.base" = type opaque
+
+@g = external local_unnamed_addr global i32, align 4
+@status = external local_unnamed_addr global i32, align 4
+
+define i32 @vararg(i32 %count, ...) {
+bb:
+  %tmp = alloca %"class.base", align 4
+  %status_loaded = load i32, i32* @status, align 4
+  %tmp4 = icmp slt i32 %status_loaded, 0
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb
+  %tmp11 = bitcast %"class.base"* %tmp to i32*
+  %tmp2 = load i32, i32* @g, align 4
+  %tmp3 = add nsw i32 %tmp2, 1
+  store i32 %tmp3, i32* %tmp11, align 4
+  store i32 %tmp3, i32* @g, align 4
+  call void @bar(i32 %count, i32* nonnull %tmp11)
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb
+  %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ]
+  ret i32 %tmp7
+}
+
+declare void @bar(i32, i32*)
+
+define i32 @caller(i32 %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i32 (i32, ...) @vararg(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @caller
+; CHECK: codeRepl.i:
+; CHECK-NEXT:   call void (%class.base*, i32, ...) @vararg.1_bb5(%class.base* %tmp.i, i32 %arg)
Index: tools/gold/CMakeLists.txt
===================================================================
--- tools/gold/CMakeLists.txt
+++ tools/gold/CMakeLists.txt
@@ -15,4 +15,8 @@
     gold-plugin.cpp
     )
 
+  if(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
+    target_link_libraries(LLVMgold PUBLIC Polly)
+  endif(WITH_POLLY AND LINK_POLLY_INTO_TOOLS)
+
 endif()
Index: tools/gold/gold-plugin.cpp
===================================================================
--- tools/gold/gold-plugin.cpp
+++ tools/gold/gold-plugin.cpp
@@ -47,6 +47,12 @@
 using namespace llvm;
 using namespace lto;
 
+#ifdef LINK_POLLY_INTO_TOOLS
+namespace polly {
+void initializePollyPasses(llvm::PassRegistry &Registry);
+}
+#endif
+
 static ld_plugin_status discard_message(int level, const char *format, ...) {
   // Die loudly. Recent versions of Gold pass ld_plugin_message as the first
   // callback in the transfer vector. This should never be called.
@@ -266,6 +272,10 @@
   InitializeAllTargetMCs();
   InitializeAllAsmParsers();
   InitializeAllAsmPrinters();
+#ifdef LINK_POLLY_INTO_TOOLS
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  polly::initializePollyPasses(Registry);
+#endif
 
   // We're given a pointer to the first transfer vector. We read through them
   // until we find one where tv_tag == LDPT_NULL. The REGISTER_* tagged values
@@ -608,16 +616,14 @@
   return NewNewPath;
 }
 
-static bool isAlpha(char C) {
-  return ('a' <= C && C <= 'z') || ('A' <= C && C <= 'Z') || C == '_';
-}
-
-static bool isAlnum(char C) { return isAlpha(C) || ('0' <= C && C <= '9'); }
+// The removed lines above prevent compilation.
+//
+// TODO: Need to investigate. Maybe a version mismatch in some of my checkouts?
+// Disable this for now.
 
 // Returns true if S is valid as a C language identifier.
 static bool isValidCIdentifier(StringRef S) {
-  return !S.empty() && isAlpha(S[0]) &&
-         std::all_of(S.begin() + 1, S.end(), isAlnum);
+  return true;
 }
 
 static void addModule(LTO &Lto, claimed_file &F, const void *View,