Index: llvm/trunk/lib/Transforms/IPO/ArgumentPromotion.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/ArgumentPromotion.cpp +++ llvm/trunk/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -566,8 +566,8 @@ /// This method limits promotion of aggregates to only promote up to three /// elements of the aggregate in order to avoid exploding the number of /// arguments passed in. -static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, - AAResults &AAR, unsigned MaxElements) { +static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR, + unsigned MaxElements) { using GEPIndicesSet = std::set; // Quick exit for unused arguments @@ -589,9 +589,6 @@ // // This set will contain all sets of indices that are loaded in the entry // block, and thus are safe to unconditionally load in the caller. - // - // This optimization is also safe for InAlloca parameters, because it verifies - // that the address isn't captured. GEPIndicesSet SafeToUnconditionallyLoad; // This set contains all the sets of indices that we are planning to promote. @@ -599,7 +596,7 @@ GEPIndicesSet ToPromote; // If the pointer is always valid, any load with first index 0 is valid. - if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg)) + if (isByVal || allCallersPassInValidPointerForArgument(Arg)) SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); // First, iterate the entry block and mark loads of (geps of) arguments as @@ -656,8 +653,7 @@ // TODO: This runs the above loop over and over again for dead GEPs // Couldn't we just do increment the UI iterator earlier and erase the // use? - return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR, - MaxElements); + return isSafeToPromoteArgument(Arg, isByVal, AAR, MaxElements); } // Ensure that all of the indices are constants. @@ -856,6 +852,11 @@ if (F->isVarArg()) return nullptr; + // Don't transform functions that receive inallocas, as the transformation may + // not be safe depending on calling convention. + if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca)) + return nullptr; + // First check: see if there are any pointer arguments! If not, quick exit. SmallVector PointerArgs; for (Argument &I : F->args()) @@ -914,8 +915,7 @@ // If this is a byval argument, and if the aggregate type is small, just // pass the elements, which is always safe, if the passed value is densely - // packed or if we can prove the padding bytes are never accessed. This does - // not apply to inalloca. + // packed or if we can prove the padding bytes are never accessed. bool isSafeToPromote = PtrArg->hasByValAttr() && (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); @@ -966,7 +966,7 @@ } // Otherwise, see if we can promote the pointer to its value. - if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR, + if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValAttr(), AAR, MaxElements)) ArgsToPromote.insert(PtrArg); } Index: llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp +++ llvm/trunk/lib/Transforms/IPO/GlobalOpt.cpp @@ -2090,21 +2090,21 @@ } } -static AttributeList StripNest(LLVMContext &C, AttributeList Attrs) { - // There can be at most one attribute set with a nest attribute. - unsigned NestIndex; - if (Attrs.hasAttrSomewhere(Attribute::Nest, &NestIndex)) - return Attrs.removeAttribute(C, NestIndex, Attribute::Nest); +static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs, + Attribute::AttrKind A) { + unsigned AttrIndex; + if (Attrs.hasAttrSomewhere(A, &AttrIndex)) + return Attrs.removeAttribute(C, AttrIndex, A); return Attrs; } -static void RemoveNestAttribute(Function *F) { - F->setAttributes(StripNest(F->getContext(), F->getAttributes())); +static void RemoveAttribute(Function *F, Attribute::AttrKind A) { + F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A)); for (User *U : F->users()) { if (isa(U)) continue; CallSite CS(cast(U)); - CS.setAttributes(StripNest(F->getContext(), CS.getAttributes())); + CS.setAttributes(StripAttr(F->getContext(), CS.getAttributes(), A)); } } @@ -2119,13 +2119,6 @@ if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall) return false; - // Don't break the invariant that the inalloca parameter is the only parameter - // passed in memory. - // FIXME: GlobalOpt should remove inalloca when possible and hoist the dynamic - // alloca it uses to the entry block if possible. - if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca)) - return false; - // FIXME: Change CC for the whole chain of musttail calls when possible. // // Can't change CC of the function that either has musttail calls, or is a @@ -2287,6 +2280,17 @@ if (!F->hasLocalLinkage()) continue; + // If we have an inalloca parameter that we can safely remove the + // inalloca attribute from, do so. This unlocks optimizations that + // wouldn't be safe in the presence of inalloca. + // FIXME: We should also hoist alloca affected by this to the entry + // block if possible. + if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) && + !F->hasAddressTaken()) { + RemoveAttribute(F, Attribute::InAlloca); + Changed = true; + } + if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { NumInternalFunc++; TargetTransformInfo &TTI = GetTTI(*F); @@ -2319,7 +2323,7 @@ !F->hasAddressTaken()) { // The function is not used by a trampoline intrinsic, so it is safe // to remove the 'nest' attribute. - RemoveNestAttribute(F); + RemoveAttribute(F, Attribute::Nest); ++NumNestRemoved; Changed = true; } Index: llvm/trunk/test/Transforms/ArgumentPromotion/X86/thiscall.ll =================================================================== --- llvm/trunk/test/Transforms/ArgumentPromotion/X86/thiscall.ll +++ llvm/trunk/test/Transforms/ArgumentPromotion/X86/thiscall.ll @@ -0,0 +1,38 @@ +; In PR41658, argpromotion put an inalloca in a position that per the +; calling convention is passed in a register. This test verifies that +; we don't do that anymore. It also verifies that the combination of +; globalopt and argpromotion is able to optimize the call safely. +; +; RUN: opt -S -argpromotion %s | FileCheck --check-prefix=THIS %s +; RUN: opt -S -globalopt -argpromotion %s | FileCheck --check-prefix=OPT %s +; THIS: define internal x86_thiscallcc void @internalfun(%struct.a* %this, <{ %struct.a +; OPT: define internal fastcc void @internalfun(<{ %struct.a }>*) + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-pc-windows-msvc19.11.0" + +%struct.a = type { i8 } + +define internal x86_thiscallcc void @internalfun(%struct.a* %this, <{ %struct.a }>* inalloca) { +entry: + %a = getelementptr inbounds <{ %struct.a }>, <{ %struct.a }>* %0, i32 0, i32 0 + %argmem = alloca inalloca <{ %struct.a }>, align 4 + %1 = getelementptr inbounds <{ %struct.a }>, <{ %struct.a }>* %argmem, i32 0, i32 0 + %call = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* %1, %struct.a* dereferenceable(1) %a) + call void @ext(<{ %struct.a }>* inalloca %argmem) + ret void +} + +; This is here to ensure @internalfun is live. +define void @exportedfun(%struct.a* %a) { + %inalloca.save = tail call i8* @llvm.stacksave() + %argmem = alloca inalloca <{ %struct.a }>, align 4 + call x86_thiscallcc void @internalfun(%struct.a* %a, <{ %struct.a }>* inalloca %argmem) + call void @llvm.stackrestore(i8* %inalloca.save) + ret void +} + +declare x86_thiscallcc %struct.a* @copy_ctor(%struct.a* returned, %struct.a* dereferenceable(1)) +declare void @ext(<{ %struct.a }>* inalloca) +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) Index: llvm/trunk/test/Transforms/ArgumentPromotion/inalloca.ll =================================================================== --- llvm/trunk/test/Transforms/ArgumentPromotion/inalloca.ll +++ llvm/trunk/test/Transforms/ArgumentPromotion/inalloca.ll @@ -1,5 +1,5 @@ -; RUN: opt %s -argpromotion -sroa -S | FileCheck %s -; RUN: opt %s -passes='argpromotion,function(sroa)' -S | FileCheck %s +; RUN: opt %s -globalopt -argpromotion -sroa -S | FileCheck %s +; RUN: opt %s -passes='module(globalopt),cgscc(argpromotion),function(sroa)' -S | FileCheck %s target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" @@ -15,7 +15,7 @@ %r = add i32 %a, %b ret i32 %r } -; CHECK-LABEL: define internal i32 @f +; CHECK-LABEL: define internal fastcc i32 @f ; CHECK-NOT: load ; CHECK: ret @@ -35,7 +35,7 @@ ; Argpromote can't promote %a because of the icmp use. define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b) nounwind { -; CHECK: define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b) +; CHECK: define internal fastcc i1 @g(%struct.ss* %a, %struct.ss* %b) entry: %c = icmp eq %struct.ss* %a, %b ret i1 %c @@ -45,6 +45,6 @@ entry: %S = alloca inalloca %struct.ss %c = call i1 @g(%struct.ss* %S, %struct.ss* inalloca %S) -; CHECK: call i1 @g(%struct.ss* %S, %struct.ss* inalloca %S) +; CHECK: call fastcc i1 @g(%struct.ss* %S, %struct.ss* %S) ret i32 0 } Index: llvm/trunk/test/Transforms/GlobalOpt/fastcc.ll =================================================================== --- llvm/trunk/test/Transforms/GlobalOpt/fastcc.ll +++ llvm/trunk/test/Transforms/GlobalOpt/fastcc.ll @@ -27,7 +27,7 @@ } define internal i32 @inalloca(i32* inalloca %p) { -; CHECK-LABEL: define internal i32 @inalloca(i32* inalloca %p) +; CHECK-LABEL: define internal fastcc i32 @inalloca(i32* %p) %rv = load i32, i32* %p ret i32 %rv } @@ -52,4 +52,4 @@ ; CHECK: call fastcc i32 @g ; CHECK: call coldcc i32 @h ; CHECK: call i32 @j -; CHECK: call i32 @inalloca(i32* inalloca %args) +; CHECK: call fastcc i32 @inalloca(i32* %args)