diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -797,6 +797,13 @@ /// pointer, and a SRCVALUE. VAEND, VASTART, + // PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE + // with the preallocated call Value. + PREALLOCATED_SETUP, + // PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE + // with the preallocated call Value, and a constant int. + PREALLOCATED_ARG, + /// SRCVALUE - This is a node type that holds a Value* that is used to /// make reference to a value in the LLVM IR. SRCVALUE, diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h --- a/llvm/include/llvm/CodeGen/TargetCallingConv.h +++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h @@ -35,6 +35,7 @@ unsigned IsReturned : 1; ///< Always returned unsigned IsSplit : 1; unsigned IsInAlloca : 1; ///< Passed with inalloca + unsigned IsPreallocated : 1; ///< ByVal without the copy unsigned IsSplitEnd : 1; ///< Last part of a split unsigned IsSwiftSelf : 1; ///< Swift self parameter unsigned IsSwiftError : 1; ///< Swift error parameter @@ -56,9 +57,9 @@ public: ArgFlagsTy() : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsNest(0), - IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0), - IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), IsHva(0), - IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0), + IsReturned(0), IsSplit(0), IsInAlloca(0), IsPreallocated(0), + IsSplitEnd(0), IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), + IsHva(0), IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0), IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), IsCopyElisionCandidate(0), IsPointer(0), ByValSize(0), PointerAddrSpace(0) { @@ -83,6 +84,9 @@ bool isInAlloca() const { return IsInAlloca; } void setInAlloca() { IsInAlloca = 1; } + bool isPreallocated() const { return IsPreallocated; } + void setPreallocated() { IsPreallocated = 1; } + bool isSwiftSelf() const { return IsSwiftSelf; } void setSwiftSelf() { IsSwiftSelf = 1; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -273,17 +273,20 @@ bool IsNest : 1; bool IsByVal : 1; bool IsInAlloca : 1; + bool IsPreallocated : 1; bool IsReturned : 1; bool IsSwiftSelf : 1; bool IsSwiftError : 1; bool IsCFGuardTarget : 1; MaybeAlign Alignment = None; Type *ByValType = nullptr; + Type *PreallocatedType = nullptr; ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), - IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false), - IsSwiftSelf(false), IsSwiftError(false), IsCFGuardTarget(false) {} + IsNest(false), IsByVal(false), IsInAlloca(false), + IsPreallocated(false), IsReturned(false), IsSwiftSelf(false), + IsSwiftError(false), IsCFGuardTarget(false) {} void setAttributes(const CallBase *Call, unsigned ArgIdx); }; @@ -3608,6 +3611,7 @@ bool IsReturnValueUsed : 1; bool IsConvergent : 1; bool IsPatchPoint : 1; + bool IsPreallocated : 1; // IsTailCall should be modified by implementations of // TargetLowering::LowerCall that perform tail call conversions. 
@@ -3631,7 +3635,7 @@ CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false), - IsPatchPoint(false), DAG(DAG) {} + IsPatchPoint(false), IsPreallocated(false), DAG(DAG) {} CallLoweringInfo &setDebugLoc(const SDLoc &dl) { DL = dl; @@ -3737,6 +3741,11 @@ return *this; } + CallLoweringInfo &setIsPreallocated(bool Value = true) { + IsPreallocated = Value; + return *this; + } + CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) { IsPostTypeLegalization = Value; return *this; diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -110,6 +110,9 @@ /// Return true if this argument has the inalloca attribute. bool hasInAllocaAttr() const; + /// Return true if this argument has the preallocated attribute. + bool hasPreallocatedAttr() const; + /// Return true if this argument has the zext attribute. bool hasZExtAttr() const; diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -623,6 +623,9 @@ /// Return the byval type for the specified function parameter. Type *getParamByValType(unsigned ArgNo) const; + /// Return the preallocated type for the specified function parameter. + Type *getParamPreallocatedType(unsigned ArgNo) const; + /// Get the stack alignment. MaybeAlign getStackAlignment(unsigned Index) const; diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1604,6 +1604,12 @@ return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); } + /// Extract the preallocated type for a call or parameter. + Type *getParamPreallocatedType(unsigned ArgNo) const { + Type *Ty = Attrs.getParamPreallocatedType(ArgNo); + return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); + } + /// Extract the number of dereferenceable bytes for a call or /// parameter (0=unknown). uint64_t getDereferenceableBytes(unsigned i) const { diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -127,6 +127,12 @@ /// additionally expand this pseudo after register allocation. HANDLE_TARGET_OPCODE(LOAD_STACK_GUARD) +/// These are used to support call sites that must have the stack adjusted +/// before the call (e.g. to initialize an argument passed by value). +/// See llvm.call.preallocated.{setup,arg} in the LangRef for more details. +HANDLE_TARGET_OPCODE(PREALLOCATED_SETUP) +HANDLE_TARGET_OPCODE(PREALLOCATED_ARG) + /// Call instruction with associated vm state for deoptimization and list /// of live pointers for relocation by the garbage collector. 
It is /// intended to support garbage collection with fully precise relocating
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1173,6 +1173,18 @@
   let hasSideEffects = 0;
   bit isPseudo = 1;
 }
+def PREALLOCATED_SETUP : StandardPseudoInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$a);
+  let usesCustomInserter = 1;
+  let hasSideEffects = 1;
+}
+def PREALLOCATED_ARG : StandardPseudoInstruction {
+  let OutOperandList = (outs ptr_rc:$loc);
+  let InOperandList = (ins i32imm:$a, i32imm:$b);
+  let usesCustomInserter = 1;
+  let hasSideEffects = 1;
+}
 def LOCAL_ESCAPE : StandardPseudoInstruction {
   // This instruction is really just a label. It has to be part of the chain so
   // that it doesn't get dropped from the DAG, but it produces nothing and has
diff --git a/llvm/include/llvm/Target/TargetCallingConv.td b/llvm/include/llvm/Target/TargetCallingConv.td
--- a/llvm/include/llvm/Target/TargetCallingConv.td
+++ b/llvm/include/llvm/Target/TargetCallingConv.td
@@ -41,6 +41,11 @@
 class CCIfByVal<CCAction A> : CCIf<"ArgFlags.isByVal()", A> {
 }
+/// CCIfPreallocated - If the current argument has the Preallocated parameter
+/// attribute, apply Action A.
+class CCIfPreallocated<CCAction A> : CCIf<"ArgFlags.isPreallocated()", A> {
+}
+
 /// CCIfSwiftSelf - If the current argument has swiftself parameter attribute,
 /// apply Action A.
 class CCIfSwiftSelf<CCAction A> : CCIf<"ArgFlags.isSwiftSelf()", A> {
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -96,10 +96,12 @@
     Flags.setSwiftError();
   if (Attrs.hasAttribute(OpIdx, Attribute::ByVal))
     Flags.setByVal();
+  if (Attrs.hasAttribute(OpIdx, Attribute::Preallocated))
+    Flags.setPreallocated();
   if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca))
     Flags.setInAlloca();
-  if (Flags.isByVal() || Flags.isInAlloca()) {
+  if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
     Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType();
     auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1214,7 +1214,16 @@
       // the various CC lowering callbacks.
       Flags.setByVal();
     }
-    if (Arg.IsByVal || Arg.IsInAlloca) {
+    if (Arg.IsPreallocated) {
+      Flags.setPreallocated();
+      // Set the byval flag for CCAssignFn callbacks that don't know about
+      // preallocated. This way we can know how many bytes we should've
+      // allocated and how many bytes a callee cleanup function will pop. If we
+      // port preallocated to more targets, we'll have to add custom
+      // preallocated handling in the various CC lowering callbacks.
+      Flags.setByVal();
+    }
+    if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) {
       PointerType *Ty = cast<PointerType>(Arg.Ty);
       Type *ElementTy = Ty->getElementType();
       unsigned FrameSize =
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1873,9 +1873,6 @@
 }
 
 SDValue SelectionDAG::getSrcValue(const Value *V) {
-  assert((!V || V->getType()->isPointerTy()) &&
-         "SrcValue is not a pointer?");
-
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
   ID.AddPointer(V);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5607,6 +5607,23 @@
   LowerCallTo(I, Callee, I.isTailCall());
 }
 
+/// Given a @llvm.call.preallocated.setup, return the corresponding
+/// preallocated call.
+static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
+  assert(cast<CallBase>(PreallocatedSetup)
+                 ->getCalledFunction()
+                 ->getIntrinsicID() == Intrinsic::call_preallocated_setup &&
+         "expected call_preallocated_setup Value");
+  for (auto *U : PreallocatedSetup->users()) {
+    auto *UseCall = cast<CallBase>(U);
+    const Function *Fn = UseCall->getCalledFunction();
+    if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) {
+      return UseCall;
+    }
+  }
+  llvm_unreachable("expected corresponding call to preallocated setup/arg");
+}
+
 /// Lower the call to the specified intrinsic function.
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -5799,6 +5816,30 @@
     updateDAGForMaybeTailCall(MC);
     return;
   }
+  case Intrinsic::call_preallocated_setup: {
+    const CallBase *PreallocatedCall = FindPreallocatedCall(&I);
+    SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+    SDValue Res = DAG.getNode(ISD::PREALLOCATED_SETUP, sdl, MVT::Other,
+                              getRoot(), SrcValue);
+    setValue(&I, Res);
+    DAG.setRoot(Res);
+    return;
+  }
+  case Intrinsic::call_preallocated_arg: {
+    const CallBase *PreallocatedCall = FindPreallocatedCall(I.getOperand(0));
+    SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+    SDValue Ops[3];
+    Ops[0] = getRoot();
+    Ops[1] = SrcValue;
+    Ops[2] = DAG.getTargetConstant(*cast<ConstantInt>(I.getArgOperand(1)), sdl,
+                                   MVT::i32); // arg index
+    SDValue Res = DAG.getNode(
+        ISD::PREALLOCATED_ARG, sdl,
+        DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Ops);
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return;
+  }
   case Intrinsic::dbg_addr:
   case Intrinsic::dbg_declare: {
     const auto &DI = cast<DbgVariableIntrinsic>(I);
@@ -7119,7 +7160,9 @@
       .setChain(getRoot())
       .setCallee(RetTy, FTy, Callee, std::move(Args), CB)
       .setTailCall(isTailCall)
-      .setConvergent(CB.isConvergent());
+      .setConvergent(CB.isConvergent())
+      .setIsPreallocated(
+          CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
 
   std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
   if (Result.first.getNode()) {
@@ -7645,9 +7688,9 @@
   // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
   // have to do anything here to lower funclet bundles.
   // CFGuardTarget bundles are lowered in LowerCallTo.
-  assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt,
-                                        LLVMContext::OB_funclet,
-                                        LLVMContext::OB_cfguardtarget}) &&
+  assert(!I.hasOperandBundlesOtherThan(
+             {LLVMContext::OB_deopt, LLVMContext::OB_funclet,
+              LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated}) &&
          "Cannot lower calls with arbitrary operand bundles!");
 
   SDValue Callee = getValue(I.getCalledOperand());
@@ -8608,7 +8651,9 @@
       .setChain(getRoot())
       .setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args))
       .setDiscardResult(Call->use_empty())
-      .setIsPatchPoint(IsPatchPoint);
+      .setIsPatchPoint(IsPatchPoint)
+      .setIsPreallocated(
+          Call->countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
 }
 
 /// Add a stack map intrinsic call's live variable operands to a stackmap
@@ -9128,6 +9173,15 @@
         Flags.setCFGuardTarget();
       if (Args[i].IsByVal)
         Flags.setByVal();
+      if (Args[i].IsPreallocated) {
+        Flags.setPreallocated();
+        // Set the byval flag for CCAssignFn callbacks that don't know about
+        // preallocated. This way we can know how many bytes we should've
+        // allocated and how many bytes a callee cleanup function will pop. If
+        // we port preallocated to more targets, we'll have to add custom
+        // preallocated handling in the various CC lowering callbacks.
+        Flags.setByVal();
+      }
       if (Args[i].IsInAlloca) {
         Flags.setInAlloca();
         // Set the byval flag for CCAssignFn callbacks that don't know about
@@ -9137,7 +9191,7 @@
         // in the various CC lowering callbacks.
         Flags.setByVal();
       }
-      if (Args[i].IsByVal || Args[i].IsInAlloca) {
+      if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) {
         PointerType *Ty = cast<PointerType>(Args[i].Ty);
         Type *ElementTy = Ty->getElementType();
@@ -9451,7 +9505,7 @@
   // initializes the alloca. Don't elide copies from the same argument twice.
   const Value *Val = SI->getValueOperand()->stripPointerCasts();
   const auto *Arg = dyn_cast<Argument>(Val);
-  if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+  if (!Arg || Arg->hasPassPointeeByValueAttr() ||
       Arg->getType()->isEmptyTy() ||
       DL.getTypeStoreSize(Arg->getType()) !=
           DL.getTypeAllocSize(AI->getAllocatedType()) ||
@@ -9638,12 +9692,21 @@
         // in the various CC lowering callbacks.
         Flags.setByVal();
       }
+      if (Arg.hasAttribute(Attribute::Preallocated)) {
+        Flags.setPreallocated();
+        // Set the byval flag for CCAssignFn callbacks that don't know about
+        // preallocated. This way we can know how many bytes we should've
+        // allocated and how many bytes a callee cleanup function will pop. If
+        // we port preallocated to more targets, we'll have to add custom
+        // preallocated handling in the various CC lowering callbacks.
+        Flags.setByVal();
+      }
       if (F.getCallingConv() == CallingConv::X86_INTR) {
         // IA Interrupt passes frame (1st parameter) by value in the stack.
         if (ArgNo == 0)
           Flags.setByVal();
       }
-      if (Flags.isByVal() || Flags.isInAlloca()) {
+      if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
         Type *ElementTy = Arg.getParamByValType();
         // For ByVal, size and alignment should be passed from FE. BE will
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -393,6 +393,10 @@
   case ISD::GC_TRANSITION_END:          return "gc_transition.end";
   case ISD::GET_DYNAMIC_AREA_OFFSET:    return "get.dynamic.area.offset";
   case ISD::FREEZE:                     return "freeze";
+  case ISD::PREALLOCATED_SETUP:
+    return "call_setup";
+  case ISD::PREALLOCATED_ARG:
+    return "call_alloc";
 
   // Bit manipulation
   case ISD::ABS:                        return "abs";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -110,14 +110,18 @@
   IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
   IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
   IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
+  IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated);
   IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
   IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
   IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
   IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
   Alignment = Call->getParamAlign(ArgIdx);
   ByValType = nullptr;
-  if (Call->paramHasAttr(ArgIdx, Attribute::ByVal))
+  if (IsByVal)
     ByValType = Call->getParamByValType(ArgIdx);
+  PreallocatedType = nullptr;
+  if (IsPreallocated)
+    PreallocatedType = Call->getParamPreallocatedType(ArgIdx);
 }
 
 /// Generate a libcall taking the given operands as arguments and returning a
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -1433,6 +1433,10 @@
   return getAttributes(Index+FirstArgIndex).getByValType();
 }
 
+Type *AttributeList::getParamPreallocatedType(unsigned Index) const {
+  return getAttributes(Index + FirstArgIndex).getPreallocatedType();
+}
+
 MaybeAlign AttributeList::getStackAlignment(unsigned Index) const {
   return getAttributes(Index).getStackAlignment();
 }
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -114,6 +114,12 @@
   return hasAttribute(Attribute::InAlloca);
 }
 
+bool Argument::hasPreallocatedAttr() const {
+  if (!getType()->isPointerTy())
+    return false;
+  return hasAttribute(Attribute::Preallocated);
+}
+
 bool Argument::hasPassPointeeByValueAttr() const {
   if (!getType()->isPointerTy())
     return false;
   AttributeList Attrs = getParent()->getAttributes();
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -789,8 +789,9 @@
 /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
 /// values are spilled on the stack.
 def CC_X86_32_Common : CallingConv<[
-  // Handles byval parameters.
+  // Handles byval/preallocated parameters.
   CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfPreallocated<CCPassByVal<4, 4>>,
 
   // The first 3 float or double arguments, if marked 'inreg' and if the call
   // is not a vararg call and if SSE2 is available, are passed in SSE registers.
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3245,7 +3245,7 @@
     return false;
 
   for (auto Flag : CLI.OutFlags)
-    if (Flag.isSwiftError())
+    if (Flag.isSwiftError() || Flag.isPreallocated())
      return false;
 
   SmallVector<MVT, 16> OutVTs;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -57,7 +57,8 @@
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
   return !MF.getFrameInfo().hasVarSizedObjects() &&
-         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
 }
 
 /// canSimplifyCallFramePseudos - If there is a reserved call frame, the
@@ -67,6 +68,7 @@
 bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) ||
+         MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
          (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
          TRI->hasBasePointer(MF);
 }
@@ -90,10 +92,10 @@
 bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
-          TRI->needsStackRealignment(MF) ||
-          MFI.hasVarSizedObjects() ||
+          TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
          MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
          MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
          MFI.hasStackMap() || MFI.hasPatchPoint() ||
          MFI.hasCopyImplyingStackAdjustment());
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5625,6 +5625,39 @@
     CurDAG->RemoveDeadNode(Node);
     return;
   }
+  case ISD::PREALLOCATED_SETUP: {
+    auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+    auto CallId = MFI->PreallocatedIdForCallSite(
+        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+    SDValue Chain = Node->getOperand(0);
+    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+    MachineSDNode *New = CurDAG->getMachineNode(
+        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
+    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+  case ISD::PREALLOCATED_ARG: {
+    auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+    auto CallId = MFI->PreallocatedIdForCallSite(
+        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+    SDValue Chain = Node->getOperand(0);
+    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+    SDValue ArgIndex = Node->getOperand(2);
+    SDValue Ops[3];
+    Ops[0] = CallIdValue;
+    Ops[1] = ArgIndex;
+    Ops[2] = Chain;
+    MachineSDNode *New = CurDAG->getMachineNode(
+        TargetOpcode::PREALLOCATED_ARG, dl,
+        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
+                          MVT::Other),
+        Ops);
+    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
+    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
   }
 
   SelectCode(Node);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3882,6 +3882,21 @@
     if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); + } else if (CLI.IsPreallocated) { + assert(ArgLocs.back().isMemLoc() && + "cannot use preallocated attribute on a register " + "parameter"); + SmallVector PreallocatedOffsets; + for (size_t i = 0; i < CLI.OutVals.size(); ++i) { + if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { + PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); + } + } + auto MFI = DAG.getMachineFunction().getInfo(); + size_t PreallocatedId = MFI->PreallocatedIdForCallSite(CLI.CB); + MFI->SetPreallocatedStackSize(PreallocatedId, NumBytes); + MFI->SetPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); + NumBytesToPush = 0; } if (!IsSibcall && !IsMustTail) @@ -3909,9 +3924,9 @@ for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); - // Skip inalloca arguments, they have already been written. + // Skip inalloca/preallocated arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; - if (Flags.isInAlloca()) + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; CCValAssign &VA = ArgLocs[I]; @@ -4099,8 +4114,8 @@ assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; - // Skip inalloca arguments. They don't require any work. - if (Flags.isInAlloca()) + // Skip inalloca/preallocated arguments. They don't require any work. + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; @@ -33061,6 +33076,36 @@ BB->addLiveIn(BasePtr); return BB; } + case TargetOpcode::PREALLOCATED_SETUP: { + assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); + auto MFI = MF->getInfo(); + MFI->setHasPreallocatedCall(true); + int64_t PreallocatedId = MI.getOperand(0).getImm(); + size_t StackAdjustment = MFI->GetPreallocatedStackSize(PreallocatedId); + assert(StackAdjustment != 0 && "0 stack adjustment"); + LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " + << StackAdjustment << "\n"); + BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(StackAdjustment); + MI.eraseFromParent(); + return BB; + } + case TargetOpcode::PREALLOCATED_ARG: { + assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); + int64_t PreallocatedId = MI.getOperand(1).getImm(); + int64_t ArgIdx = MI.getOperand(2).getImm(); + auto MFI = MF->getInfo(); + size_t ArgOffset = MFI->GetPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; + LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx + << ", arg offset " << ArgOffset << "\n"); + // stack pointer + offset + addRegOffset( + BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), + X86::ESP, false, ArgOffset); + MI.eraseFromParent(); + return BB; + } } } diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" @@ -103,6 +104,13 @@ /// True if this function has WIN_ALLOCA instructions. bool HasWinAlloca = false; + /// True if this function has any preallocated calls. 
+  bool HasPreallocatedCall = false;
+
+  ValueMap<const Value *, size_t> PreallocatedIds;
+  SmallVector<size_t, 0> PreallocatedStackSizes;
+  SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
+
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
   /// that must be forwarded to every musttail call.
@@ -184,6 +192,36 @@
   bool hasWinAlloca() const { return HasWinAlloca; }
   void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+
+  bool hasPreallocatedCall() const { return HasPreallocatedCall; }
+  void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+
+  size_t PreallocatedIdForCallSite(const Value *CS) {
+    auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()});
+    if (Insert.second) {
+      PreallocatedStackSizes.push_back(0);
+      PreallocatedArgOffsets.emplace_back();
+    }
+    return Insert.first->second;
+  }
+
+  void SetPreallocatedStackSize(size_t Id, size_t StackSize) {
+    PreallocatedStackSizes[Id] = StackSize;
+  }
+
+  size_t GetPreallocatedStackSize(const size_t Id) {
+    assert(PreallocatedStackSizes[Id] != 0 && "stack size not set");
+    return PreallocatedStackSizes[Id];
+  }
+
+  void SetPreallocatedArgOffsets(size_t Id, SmallVector<size_t, 4> AO) {
+    PreallocatedArgOffsets[Id] = AO;
+  }
+
+  const SmallVector<size_t, 4> &GetPreallocatedArgOffsets(const size_t Id) {
+    assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set");
+    return PreallocatedArgOffsets[Id];
+  }
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -627,18 +627,22 @@
 }
 
 bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  if (!EnableBasePointer)
-    return false;
-
-  // When we need stack realignment, we can't address the stack from the frame
-  // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
-  // can't address variables from the stack pointer. MS inline asm can
-  // reference locals while also adjusting the stack pointer. When we can't
-  // use both the SP and the FP, we need a separate base pointer register.
-  bool CantUseFP = needsStackRealignment(MF);
-  return CantUseFP && CantUseSP(MFI);
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  if (X86FI->hasPreallocatedCall())
+    return true;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!EnableBasePointer)
+    return false;
+
+  // When we need stack realignment, we can't address the stack from the frame
+  // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+  // can't address variables from the stack pointer. MS inline asm can
+  // reference locals while also adjusting the stack pointer. When we can't
+  // use both the SP and the FP, we need a separate base pointer register.
+  bool CantUseFP = needsStackRealignment(MF);
+  return CantUseFP && CantUseSP(MFI);
 }
 
 bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1014,9 +1014,9 @@
   // CI should not has any ABI-impacting function attributes.
static const Attribute::AttrKind ABIAttrs[] = { - Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, - Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf, - Attribute::SwiftError}; + Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, + Attribute::Preallocated, Attribute::InReg, Attribute::Returned, + Attribute::SwiftSelf, Attribute::SwiftError}; AttributeList Attrs = CI.getAttributes(); for (auto AK : ABIAttrs) if (Attrs.hasParamAttribute(0, AK)) diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -1372,7 +1372,8 @@ AttributeList FnAttributeList = Fn->getAttributes(); if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) || FnAttributeList.hasAttrSomewhere(Attribute::StructRet) || - FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) { + FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) || + FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) { LLVM_DEBUG( dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n"); return false; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -4361,7 +4361,8 @@ AAValueSimplifyImpl::initialize(A); if (!getAnchorScope() || getAnchorScope()->isDeclaration()) indicatePessimisticFixpoint(); - if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest}, + if (hasAttr({Attribute::InAlloca, Attribute::Preallocated, + Attribute::StructRet, Attribute::Nest}, /* IgnoreSubsumingPositions */ true)) indicatePessimisticFixpoint(); @@ -5588,7 +5589,7 @@ // TODO: From readattrs.ll: "inalloca parameters are always // considered written" - if (hasAttr({Attribute::InAlloca})) { + if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) { removeKnownBits(NO_WRITES); removeAssumedBits(NO_WRITES); } diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -483,9 +483,10 @@ // We consider arguments of non-internal functions to be intrinsically alive as // well as arguments to functions which have their "address taken". void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { - // Functions with inalloca parameters are expecting args in a particular - // register and memory layout. - if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) { + // Functions with inalloca/preallocated parameters are expecting args in a + // particular register and memory layout. + if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { MarkLive(F); return; } diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -447,7 +447,7 @@ SmallPtrSet Visited; // inalloca arguments are always clobbered by the call. 
- if (A->hasInAllocaAttr()) + if (A->hasInAllocaAttr() || A->hasPreallocatedAttr()) return Attribute::None; bool IsRead = false; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2333,6 +2333,7 @@ // wouldn't be safe in the presence of inalloca. // FIXME: We should also hoist alloca affected by this to the entry // block if possible. + // FIXME: handle preallocated if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) && !F->hasAddressTaken()) { RemoveAttribute(F, Attribute::InAlloca); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4740,6 +4740,7 @@ // // Similarly, avoid folding away bitcasts of byval calls. if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) || Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) return false; diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -253,6 +253,20 @@ ; CHECK: calll _addrof_i32 ; CHECK: retl +define void @avoid_preallocated(i32* preallocated(i32) %x) { +entry: + %x.p.p = alloca i32* + store i32* %x, i32** %x.p.p + call void @addrof_i32(i32* %x) + ret void +} + +; CHECK-LABEL: _avoid_preallocated: +; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]] +; CHECK: pushl %[[reg]] +; CHECK: calll _addrof_i32 +; CHECK: retl + ; Don't elide the copy when the alloca is escaped with a store. define void @escape_with_store(i32 %x) { %x1 = alloca i32 diff --git a/llvm/test/CodeGen/X86/musttail-indirect.ll b/llvm/test/CodeGen/X86/musttail-indirect.ll --- a/llvm/test/CodeGen/X86/musttail-indirect.ll +++ b/llvm/test/CodeGen/X86/musttail-indirect.ll @@ -22,6 +22,9 @@ ; Each member pointer creates a thunk. The ones with inalloca are required to ; tail calls by the ABI, even at O0. +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + %struct.B = type { i32 (...)** } %struct.A = type { i32 } @@ -52,6 +55,25 @@ ret i32 %3 } +; FIXME: This generates a lot of code even at -O2, any better way to do this? Same with all the preallocated versions of functions below. 
+; CHECK-LABEL: g_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc i32 @g_thunk_2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %0) { +entry: + %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** + %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn + %tmp = load <{ %struct.A, i32, %struct.A }>, <{ %struct.A, i32, %struct.A }>* %0 + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A, i32, %struct.A }>* + store <{ %struct.A, i32, %struct.A }> %tmp, <{ %struct.A, i32, %struct.A }>* %a + %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} + ; CHECK-LABEL: h_thunk: ; CHECK: jmpl ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} @@ -66,6 +88,22 @@ ret void } +; CHECK-LABEL: h_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc void @h_thunk_2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>)) { +entry: + %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** + %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2 + %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A, i32, %struct.A }>* + musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret void +} + ; CHECK-LABEL: i_thunk: ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} ; CHECK: jmpl @@ -80,6 +118,22 @@ ret %struct.A* %3 } +; CHECK-LABEL: i_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc %struct.A* @i_thunk_2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>)) { +entry: + %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** + %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)**, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3 + %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* 
@llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A*, %struct.A, i32, %struct.A }>* + %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret %struct.A* %3 +} + ; CHECK-LABEL: j_thunk: ; CHECK: jmpl ; CHECK-NOT: ret @@ -109,6 +163,24 @@ ret i32 %3 } +; CHECK-LABEL: _stdcall_thunk_2@8: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_stdcallcc i32 @stdcall_thunk_2(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>)) { +entry: + %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>, <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0 + %this = load %struct.B*, %struct.B** %this_ptr + %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)*** + %vtable = load i32 (<{ %struct.B*, %struct.A }>*)**, i32 (<{ %struct.B*, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.B*, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.B*, %struct.A }>* + %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} + ; CHECK-LABEL: @fastcall_thunk@8: ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} ; CHECK: jmpl @@ -122,3 +194,19 @@ %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0) ret i32 %3 } + +; CHECK-LABEL: @fastcall_thunk_2@8: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_fastcallcc i32 @fastcall_thunk_2(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{%struct.A}>)) { +entry: + %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)*** + %vtable = load i32 (%struct.B*, <{ %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A }>* + %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{ %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} diff --git a/llvm/test/CodeGen/X86/musttail-thiscall.ll b/llvm/test/CodeGen/X86/musttail-thiscall.ll --- a/llvm/test/CodeGen/X86/musttail-thiscall.ll +++ b/llvm/test/CodeGen/X86/musttail-thiscall.ll @@ -1,6 +1,9 @@ ; RUN: llc -verify-machineinstrs -mtriple=i686-- < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=i686-- -O0 < %s | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + ; CHECK-LABEL: t1: ; CHECK: jmp {{_?}}t1_callee define x86_thiscallcc void @t1(i8* %this) { @@ -29,3 +32,19 @@ ret i8* %rv } declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args); + +; CHECK-LABEL: t4: +; CHECK: jmp {{_?}}t4_callee +define x86_thiscallcc i8* @t4(i8* %this, <{ i8*, i32 }>* preallocated(<{i8*, i32}>) %args) { + %adj 
= getelementptr i8, i8* %this, i32 4 + %a_ptr = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %args, i32 0, i32 1 + store i32 0, i32* %a_ptr + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{i8*, i32}>) + %a = bitcast i8* %A to <{ i8*, i32 }>* + %tmp = load <{ i8*, i32 }>, <{ i8*, i32 }>* %args + store <{ i8*, i32 }> %tmp, <{ i8*, i32 }>* %a + %rv = musttail call x86_thiscallcc i8* @t4_callee(i8* %adj, <{ i8*, i32 }>* preallocated(<{ i8*, i32 }>) %a) ["preallocated"(token %c)] + ret i8* %rv +} +declare x86_thiscallcc i8* @t4_callee(i8* %this, <{ i8*, i32 }>* preallocated(<{i8*, i32}>) %args); diff --git a/llvm/test/CodeGen/X86/preallocated-nocall.ll b/llvm/test/CodeGen/X86/preallocated-nocall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated-nocall.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s +; XFAIL: * + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare void @init(%Foo*) + + + +declare void @foo_p(%Foo* preallocated(%Foo)) + +define void @no_call() { +; CHECK-LABEL: _no_call: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + call void @init(%Foo* %b) + ret void +} diff --git a/llvm/test/CodeGen/X86/preallocated-x64.ll b/llvm/test/CodeGen/X86/preallocated-x64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated-x64.ll @@ -0,0 +1,17 @@ +; RUN: llc %s -mtriple=x86_64-windows-msvc -o /dev/null 2>&1 +; XFAIL: * + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare x86_thiscallcc void @f(i32, %Foo* preallocated(%Foo)) + +define void @g() { + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + call void @f(i32 0, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} diff --git a/llvm/test/CodeGen/X86/preallocated.ll b/llvm/test/CodeGen/X86/preallocated.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated.ll @@ -0,0 +1,187 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare void @init(%Foo*) + + + +declare void @foo_p(%Foo* preallocated(%Foo)) + +define void @one_preallocated() { +; CHECK-LABEL: _one_preallocated: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void @one_preallocated_two_blocks() { +; CHECK-LABEL: _one_preallocated_two_blocks: + %t = call token @llvm.call.preallocated.setup(i32 1) + br label %second +second: + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void @preallocated_with_store() { +; CHECK-LABEL: _preallocated_with_store: +; CHECK: subl $8, %esp + %t 
= call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + %p0 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 + %p1 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 + store i32 13, i32* %p0 + store i32 42, i32* %p1 +; CHECK-DAG: movl $13, ([[REGISTER]]) +; CHECK-DAG: movl $42, 4([[REGISTER]]) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void @preallocated_with_init() { +; CHECK-LABEL: _preallocated_with_init: +; CHECK: subl $8, %esp + %t = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: pushl [[REGISTER]] +; CHECK: calll _init + call void @init(%Foo* %b) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare void @foo_p_p(%Foo* preallocated(%Foo), %Foo* preallocated(%Foo)) + +define void @two_preallocated() { +; CHECK-LABEL: _two_preallocated: + %t = call token @llvm.call.preallocated.setup(i32 2) + %a1 = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b1 = bitcast i8* %a1 to %Foo* + %a2 = call i8* @llvm.call.preallocated.arg(token %t, i32 1) preallocated(%Foo) + %b2 = bitcast i8* %a2 to %Foo* +; CHECK: subl $16, %esp +; CHECK: calll _foo_p_p + call void @foo_p_p(%Foo* preallocated(%Foo) %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t)] + ret void +} + +declare void @foo_p_int(%Foo* preallocated(%Foo), i32) + +define void @one_preallocated_one_normal() { +; CHECK-LABEL: _one_preallocated_one_normal: +; CHECK: subl $12, %esp + %t = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: pushl [[REGISTER]] +; CHECK: calll _init + call void @init(%Foo* %b) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: movl $2, 8(%esp) +; CHECK: calll _foo_p_int + call void @foo_p_int(%Foo* preallocated(%Foo) %b, i32 2) ["preallocated"(token %t)] + ret void +} + +declare void @foo_ret_p(%Foo* sret, %Foo* preallocated(%Foo)) + +define void @nested_with_init() { +; CHECK-LABEL: _nested_with_init: + %tmp = alloca %Foo + + %t1 = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: subl $12, %esp + %a1 = call i8* @llvm.call.preallocated.arg(token %t1, i32 0) preallocated(%Foo) + %b1 = bitcast i8* %a1 to %Foo* +; CHECK: leal 4(%esp), [[REGISTER1:%[a-z]+]] + + %t2 = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: subl $12, %esp + %a2 = call i8* @llvm.call.preallocated.arg(token %t2, i32 0) preallocated(%Foo) +; CHECK: leal 4(%esp), [[REGISTER2:%[a-z]+]] + %b2 = bitcast i8* %a2 to %Foo* + + call void @init(%Foo* %b2) +; CHECK: pushl [[REGISTER2]] +; CHECK: calll _init + + call void @foo_ret_p(%Foo* %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t2)] +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_ret_p + call void @foo_ret_p(%Foo* %tmp, %Foo* preallocated(%Foo) %b1) ["preallocated"(token %t1)] +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_ret_p + ret void +} 
+ +declare void @foo_inreg_p(i32 inreg, %Foo* preallocated(%Foo)) + +define void @inreg() { +; CHECK-LABEL: _inreg: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: movl $9, %eax +; CHECK: calll _foo_inreg_p + call void @foo_inreg_p(i32 9, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare x86_thiscallcc void @foo_thiscall_p(i8*, %Foo* preallocated(%Foo)) + +define void @thiscall() { +; CHECK-LABEL: _thiscall: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: xorl %ecx, %ecx +; CHECK: calll _foo_thiscall_p + call x86_thiscallcc void @foo_thiscall_p(i8* null, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo)) +declare x86_stdcallcc void @i(i32) + +define void @stdcall() { +; CHECK-LABEL: _stdcall: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_stdcall_p@8 + call x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] +; CHECK-NOT: %esp +; CHECK: pushl +; CHECK: calll _i@4 + call x86_stdcallcc void @i(i32 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll b/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll --- a/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll +++ b/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -enable-shrink-wrap=true | FileCheck %s +; TODO: add preallocated versions of tests +; we don't yet support conditionally called preallocated calls after the setup + ; chkstk cannot come before the usual prologue, since it adjusts ESP. ; If chkstk is used in the prologue, we also have to be careful about preserving ; EAX if it is used. 
diff --git a/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll b/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll --- a/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll +++ b/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll @@ -9,6 +9,21 @@ declare x86_stdcallcc void @tail_std(i32) declare void @capture(i32*) +define x86_thiscallcc void @preallocated(i32* %this, i32* preallocated(i32) %args) { +entry: + %val = load i32, i32* %args + store i32 0, i32* %args + tail call x86_stdcallcc void @tail_std(i32 %val) + ret void +} + +; CHECK-LABEL: _preallocated: # @preallocated +; CHECK: movl 4(%esp), %[[reg:[^ ]*]] +; CHECK: movl $0, 4(%esp) +; CHECK: pushl %[[reg]] +; CHECK: calll _tail_std@4 +; CHECK: retl $4 + define x86_thiscallcc void @inalloca(i32* %this, i32* inalloca %args) { entry: %val = load i32, i32* %args diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -6,6 +6,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" declare void @f(i32) +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) ; Test1: Replace argument with constant define internal void @test1(i32 %a) { @@ -284,6 +286,38 @@ ret i32* %call } +define internal i32* @test_preallocated(i32* preallocated(i32) %a) { +; IS__TUNIT____-LABEL: define {{[^@]+}}@test_preallocated +; IS__TUNIT____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__TUNIT____-NEXT: ret i32* [[A]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test_preallocated +; IS__CGSCC____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) "no-capture-maybe-returned" [[A:%.*]]) +; IS__CGSCC____-NEXT: ret i32* [[A]] +; + ret i32* %a +} +define i32* @complicated_args_preallocated() { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #5 [ "preallocated"(token [[C]]) ] +; IS__TUNIT_OPM-NEXT: ret i32* [[CALL]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__TUNIT_NPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #4 [ "preallocated"(token [[C]]) ] +; IS__TUNIT_NPM-NEXT: ret i32* [[CALL]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__CGSCC____-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #6 [ "preallocated"(token [[C]]) ] +; IS__CGSCC____-NEXT: ret i32* [[CALL]] +; + %c = call token @llvm.call.preallocated.setup(i32 1) + %call = call i32* @test_preallocated(i32* preallocated(i32) null) ["preallocated"(token %c)] + ret i32* %call +} + define internal void @test_sret(%struct.X* sret %a, %struct.X** %b) { ; ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_sret diff --git a/llvm/test/Transforms/DeadArgElim/keepalive.ll b/llvm/test/Transforms/DeadArgElim/keepalive.ll --- 
a/llvm/test/Transforms/DeadArgElim/keepalive.ll +++ b/llvm/test/Transforms/DeadArgElim/keepalive.ll @@ -1,5 +1,8 @@ ; RUN: opt < %s -deadargelim -S | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + %Ty = type <{ i32, i32 }> ; Check if the pass doesn't modify anything that doesn't need changing. We feed @@ -44,4 +47,22 @@ ret i32 %v } +; We can't remove 'this' here, as that would put argmem in ecx instead of +; memory. +define internal x86_thiscallcc i32 @unused_this_preallocated(i32* %this, i32* preallocated(i32) %argmem) { + %v = load i32, i32* %argmem + ret i32 %v +} +; CHECK-LABEL: define internal x86_thiscallcc i32 @unused_this_preallocated(i32* %this, i32* preallocated(i32) %argmem) + +define i32 @caller3() { + %t = alloca i32 + %c = call token @llvm.call.preallocated.setup(i32 1) + %M = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32) + %m = bitcast i8* %M to i32* + store i32 42, i32* %m + %v = call x86_thiscallcc i32 @unused_this_preallocated(i32* %t, i32* preallocated(i32) %m) ["preallocated"(token %c)] + ret i32 %v +} + ; CHECK: attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll @@ -58,6 +58,16 @@ ret void } +; Test for preallocated handling. +define void @test9_3(%struct.x* preallocated(%struct.x) %a) nounwind { +; CHECK-LABEL: @test9_3( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + ; DSE should delete the dead trampoline. declare void @test11f() define void @test11() { diff --git a/llvm/test/Transforms/DeadStoreElimination/simple.ll b/llvm/test/Transforms/DeadStoreElimination/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/simple.ll @@ -169,6 +169,16 @@ ret void } +; Test for preallocated handling. +define void @test9_3(%struct.x* preallocated(%struct.x) %a) nounwind { +; CHECK-LABEL: @test9_3( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + ; va_arg has fuzzy dependence, the store shouldn't be zapped. 
define double @test10(i8* %X) { ; CHECK-LABEL: @test10( diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -56,6 +56,12 @@ ret void } +; CHECK: define void @test7_2(i32* nocapture preallocated(i32) %a) +; preallocated parameters are always considered written +define void @test7_2(i32* preallocated(i32) %a) { + ret void +} + ; CHECK: define i32* @test8_1(i32* readnone returned %p) define i32* @test8_1(i32* %p) { entry: diff --git a/llvm/test/Transforms/GlobalOpt/fastcc.ll b/llvm/test/Transforms/GlobalOpt/fastcc.ll --- a/llvm/test/Transforms/GlobalOpt/fastcc.ll +++ b/llvm/test/Transforms/GlobalOpt/fastcc.ll @@ -1,5 +1,8 @@ ; RUN: opt < %s -globalopt -S | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + define internal i32 @f(i32* %m) { ; CHECK-LABEL: define internal fastcc i32 @f %v = load i32, i32* %m @@ -32,6 +35,13 @@ ret i32 %rv } +define internal i32 @preallocated(i32* preallocated(i32) %p) { +; TODO: handle preallocated: +; CHECK-NOT-LABEL: define internal fastcc i32 @preallocated(i32* %p) + %rv = load i32, i32* %p + ret i32 %rv +} + define void @call_things() { %m = alloca i32 call i32 @f(i32* %m) @@ -40,6 +50,11 @@ call i32 @j(i32* %m) %args = alloca inalloca i32 call i32 @inalloca(i32* inalloca %args) + ; TODO: handle preallocated + ;%c = call token @llvm.call.preallocated.setup(i32 1) + ;%N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32) + ;%n = bitcast i8* %N to i32* + ;call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)] ret void } diff --git a/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll b/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32" +target triple = "i686-pc-win32" + + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +declare void @takes_i32(i32) +declare void @takes_i32_preallocated(i32* preallocated(i32)) + +define void @f() { +; CHECK-LABEL: define void @f() + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(i32) + %arg = bitcast i8* %a to i32* + call void bitcast (void (i32)* @takes_i32 to void (i32*)*)(i32* preallocated(i32) %arg) ["preallocated"(token %t)] +; CHECK: call void bitcast{{.*}}@takes_i32 + ret void +} + +define void @g() { +; CHECK-LABEL: define void @g() + call void bitcast (void (i32*)* @takes_i32_preallocated to void (i32)*)(i32 0) +; CHECK: call void bitcast{{.*}}@takes_i32_preallocated + ret void +}