diff --git a/llvm/lib/Target/PowerPC/PPCCCState.h b/llvm/lib/Target/PowerPC/PPCCCState.h
--- a/llvm/lib/Target/PowerPC/PPCCCState.h
+++ b/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -10,6 +10,7 @@
 #define PPCCCSTATE_H

 #include "PPCISelLowering.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"

@@ -36,6 +37,37 @@
   bool WasOriginalArgPPCF128(unsigned ValNo) { return OriginalArgWasPPCF128[ValNo]; }
   void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
 };
-}
+
+class AIXCCState : public CCState {
+private:
+  BitVector IsFixed;
+
+public:
+  AIXCCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
+             SmallVectorImpl<CCValAssign> &Locs, LLVMContext &C)
+      : CCState(CC, IsVarArg, MF, Locs, C) {}
+
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              CCAssignFn Fn) {
+    // All formal arguments are fixed.
+    IsFixed.resize(Ins.size(), true);
+    CCState::AnalyzeFormalArguments(Ins, Fn);
+  }
+
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           CCAssignFn Fn) {
+    // Record whether the call operand was a fixed argument.
+    IsFixed.resize(Outs.size(), false);
+    for (unsigned ValNo = 0, E = Outs.size(); ValNo != E; ++ValNo)
+      if (Outs[ValNo].IsFixed)
+        IsFixed.set(ValNo);
+
+    CCState::AnalyzeCallOperands(Outs, Fn);
+  }
+
+  bool isFixed(unsigned ValNo) { return IsFixed.test(ValNo); }
+};
+
+} // end namespace llvm

 #endif
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6262,10 +6262,43 @@
                      Callee, SPDiff, NumBytes, Ins, InVals, CB);
 }

+// Returns true when the shadow of a general purpose argument register
+// in the parameter save area is aligned to at least 'RequiredAlign'.
+static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
+  assert(RequiredAlign.value() <= 16 &&
+         "Required alignment greater than stack alignment.");
+  switch (Reg) {
+  default:
+    report_fatal_error("called on invalid register.");
+  case PPC::R5:
+  case PPC::R9:
+  case PPC::X3:
+  case PPC::X5:
+  case PPC::X7:
+  case PPC::X9:
+    // These registers are 16 byte aligned which is the most strict alignment
+    // we can support.
+    return true;
+  case PPC::R3:
+  case PPC::R7:
+  case PPC::X4:
+  case PPC::X6:
+  case PPC::X8:
+  case PPC::X10:
+    // The shadow of these registers in the PSA is 8 byte aligned.
+    return RequiredAlign <= 8;
+  case PPC::R4:
+  case PPC::R6:
+  case PPC::R8:
+  case PPC::R10:
+    return RequiredAlign <= 4;
+  }
+}
+
 static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                   CCState &State) {
-
+                   CCState &S) {
+  AIXCCState &State = static_cast<AIXCCState &>(S);
   const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
       State.getMachineFunction().getSubtarget());
   const bool IsPPC64 = Subtarget.isPPC64();
@@ -6397,18 +6430,97 @@
   case MVT::v2i64:
   case MVT::v2f64:
   case MVT::v1i128: {
-    if (State.isVarArg())
-      report_fatal_error(
-          "variadic arguments for vector types are unimplemented for AIX");
+    const unsigned VecSize = 16;
+    const Align VecAlign(VecSize);
+
+    if (!State.isVarArg()) {
+      // If there are vector registers remaining we don't consume any stack
+      // space.
+      if (unsigned VReg = State.AllocateReg(VR)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+        return false;
+      }
+      // Vectors passed on the stack do not shadow GPRs or FPRs even though they
+      // might be allocated in the portion of the PSA that is shadowed by the
+      // GPRs.
+      const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+      State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+      return false;
+    }

-    if (unsigned VReg = State.AllocateReg(VR)) {
-      State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+    const unsigned PtrSize = IsPPC64 ? 8 : 4;
+    ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
+
+    unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
+    // Burn any underaligned registers and their shadowed stack space until
+    // we reach the required alignment.
+    while (NextRegIndex != GPRs.size() &&
+           !isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) {
+      // Shadow allocate register and its stack shadow.
+      unsigned Reg = State.AllocateReg(GPRs);
+      State.AllocateStack(PtrSize, PtrAlign);
+      assert(Reg && "Allocating register unexpectedly failed.");
+      (void)Reg;
+      NextRegIndex = State.getFirstUnallocated(GPRs);
+    }
+
+    // Vectors that are passed as fixed arguments are handled differently.
+    // They are passed in VRs if any are available (unlike arguments passed
+    // through ellipses) and shadow GPRs (unlike arguments to non-vaarg
+    // functions).
+    if (State.isFixed(ValNo)) {
+      if (unsigned VReg = State.AllocateReg(VR)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
+        // Shadow allocate GPRs and stack space even though we pass in a VR.
+        for (unsigned I = 0; I != VecSize; I += PtrSize)
+          State.AllocateReg(GPRs);
+        State.AllocateStack(VecSize, VecAlign);
+        return false;
+      }
+      // No vector registers remain so pass on the stack.
+      const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+      State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
       return false;
     }

-    const unsigned VecSize = 16;
-    const unsigned Offset = State.AllocateStack(VecSize, Align(VecSize));
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    // If all GPRs are consumed then we pass the argument fully on the stack.
+    if (NextRegIndex == GPRs.size()) {
+      const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+      State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+      return false;
+    }
+
+    // Corner case for 32-bit codegen. We have 2 registers to pass the first
+    // half of the argument, and then need to pass the remaining half on the
+    // stack.
+    if (GPRs[NextRegIndex] == PPC::R9) {
+      const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+      State.addLoc(
+          CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+      const unsigned FirstReg = State.AllocateReg(PPC::R9);
+      const unsigned SecondReg = State.AllocateReg(PPC::R10);
+      assert(FirstReg && SecondReg &&
+             "Allocating R9 or R10 unexpectedly failed.");
+      State.addLoc(
+          CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, RegVT, LocInfo));
+      State.addLoc(
+          CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, RegVT, LocInfo));
+      return false;
+    }
+
+    // We have enough GPRs to fully pass the vector argument, and we have
+    // already consumed any underaligned registers. Start with the custom
+    // MemLoc and then the custom RegLocs.
+    const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
+    State.addLoc(
+        CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    for (unsigned I = 0; I != VecSize; I += PtrSize) {
+      const unsigned Reg = State.AllocateReg(GPRs);
+      assert(Reg && "Failed to allocate register for vararg vector argument");
+      State.addLoc(
+          CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
+    }
     return false;
   }
   }
@@ -6542,7 +6654,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
-  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+  AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

   const EVT PtrVT = getPointerTy(MF.getDataLayout());
   // Reserve space for the linkage area on the stack.
@@ -6555,6 +6667,7 @@
   for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
     CCValAssign &VA = ArgLocs[I++];
     MVT LocVT = VA.getLocVT();
+    MVT ValVT = VA.getValVT();
     ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
     // For compatibility with the AIX XL compiler, the float args in the
     // parameter save area are initialized even if the argument is available
@@ -6562,9 +6675,46 @@
     // and memory, however, the callee can choose to expect it in either.
     // The memloc is dismissed here because the argument is retrieved from
     // the register.
-    if (VA.isMemLoc() && VA.needsCustom())
+    if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
       continue;

+    auto HandleMemLoc = [&]() {
+      const unsigned LocSize = LocVT.getStoreSize();
+      const unsigned ValSize = ValVT.getStoreSize();
+      assert((ValSize <= LocSize) &&
+             "Object size is larger than size of MemLoc");
+      int CurArgOffset = VA.getLocMemOffset();
+      // Objects are right-justified because AIX is big-endian.
+      if (LocSize > ValSize)
+        CurArgOffset += LocSize - ValSize;
+      // Potential tail calls could cause overwriting of argument stack slots.
+      const bool IsImmutable =
+          !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+            (CallConv == CallingConv::Fast));
+      int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      SDValue ArgValue =
+          DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
+      InVals.push_back(ArgValue);
+    };
+
+    // Vector arguments to VaArg functions are passed both on the stack, and
+    // in any available GPRs. Load the value from the stack and add the GPRs
+    // as live ins.
+    if (VA.isMemLoc() && VA.needsCustom()) {
+      assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
+      assert(isVarArg && "Only use custom memloc for vararg.");
+      HandleMemLoc();
+      while (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
+        VA = ArgLocs[I++];
+        assert(VA.getValVT().isVector() &&
+               "Unexpected Val type for custom RegLoc.");
+        MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
+        MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
+      }
+      continue;
+    }
+
     if (VA.isRegLoc()) {
       if (VA.getValVT().isScalarInteger())
         FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
@@ -6652,9 +6802,8 @@
       continue;
     }

-    EVT ValVT = VA.getValVT();
     if (VA.isRegLoc() && !VA.needsCustom()) {
-      MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
+      MVT::SimpleValueType SVT = ValVT.SimpleTy;
       unsigned VReg =
           MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
@@ -6667,23 +6816,7 @@
       continue;
     }
     if (VA.isMemLoc()) {
-      const unsigned LocSize = LocVT.getStoreSize();
-      const unsigned ValSize = ValVT.getStoreSize();
-      assert((ValSize <= LocSize) &&
-             "Object size is larger than size of MemLoc");
-      int CurArgOffset = VA.getLocMemOffset();
-      // Objects are right-justified because AIX is big-endian.
-      if (LocSize > ValSize)
-        CurArgOffset += LocSize - ValSize;
-      // Potential tail calls could cause overwriting of argument stack slots.
-      const bool IsImmutable =
-          !(getTargetMachine().Options.GuaranteedTailCallOpt &&
-            (CallConv == CallingConv::Fast));
-      int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
-      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      SDValue ArgValue =
-          DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
-      InVals.push_back(ArgValue);
+      HandleMemLoc();
       continue;
     }
   }
@@ -6764,8 +6897,8 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
-                 *DAG.getContext());
+  AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+                    *DAG.getContext());

   // Reserve space for the linkage save area (LSA) on the stack.
   // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -6934,11 +7067,15 @@
       continue;
     }

+    if (!ValVT.isFloatingPoint())
+      report_fatal_error(
+          "Unexpected register handling for calling convention.");
+
     // Custom handling is used for GPR initializations for vararg float
     // arguments.
     assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
-           ValVT.isFloatingPoint() && LocVT.isInteger() &&
-           "Unexpected register handling for calling convention.");
+           LocVT.isInteger() &&
+           "Custom register handling only expected for VarArg.");

     SDValue ArgAsInt =
         DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
diff --git a/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee-split.ll b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee-split.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee-split.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff < %s | \
+; RUN:     FileCheck %s
+
+define <4 x i32> @split_spill(double %d1, double %d2, double %d3, ...) {
+  ; CHECK-LABEL: name: split_spill
+  ; CHECK: bb.0.entry:
+  ; CHECK:   liveins: $r9, $r10
+  ; CHECK:   [[COPY:%[0-9]+]]:gprc = COPY $r10
+  ; CHECK:   [[COPY1:%[0-9]+]]:gprc = COPY $r9
+  ; CHECK:   STW [[COPY1]], 0, %fixed-stack.0 :: (store 4 into %fixed-stack.0, align 16)
+  ; CHECK:   STW [[COPY]], 4, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 4)
+  ; CHECK:   LIFETIME_START %stack.0.arg_list
+  ; CHECK:   [[ADDI:%[0-9]+]]:gprc = ADDI %fixed-stack.0, 0
+  ; CHECK:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[ADDI]] :: (load 16 from %ir.4)
+  ; CHECK:   LIFETIME_END %stack.0.arg_list
+  ; CHECK:   $v2 = COPY [[LXVW4X]]
+  ; CHECK:   BLR implicit $lr, implicit $rm, implicit $v2
+entry:
+  %arg_list = alloca i8*, align 4
+  %0 = bitcast i8** %arg_list to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %arg_list, align 4
+  %1 = ptrtoint i8* %argp.cur to i32
+  %2 = add i32 %1, 15
+  %3 = and i32 %2, -16
+  %argp.cur.aligned = inttoptr i32 %3 to i8*
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur.aligned, i32 16
+  store i8* %argp.next, i8** %arg_list, align 4
+  %4 = inttoptr i32 %3 to <4 x i32>*
+  %5 = load <4 x i32>, <4 x i32>* %4, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret <4 x i32> %5
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
diff --git a/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee.ll b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-callee.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff < %s | \
+; RUN:     FileCheck %s
+
+define <4 x i32> @callee(i32 %count, ...) {
+  ; CHECK-LABEL: name: callee
+  ; CHECK: bb.0.entry:
+  ; CHECK:   liveins: $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; CHECK:   [[COPY:%[0-9]+]]:gprc = COPY $r10
+  ; CHECK:   [[COPY1:%[0-9]+]]:gprc = COPY $r9
+  ; CHECK:   [[COPY2:%[0-9]+]]:gprc = COPY $r8
+  ; CHECK:   [[COPY3:%[0-9]+]]:gprc = COPY $r7
+  ; CHECK:   [[COPY4:%[0-9]+]]:gprc = COPY $r6
+  ; CHECK:   [[COPY5:%[0-9]+]]:gprc = COPY $r5
+  ; CHECK:   [[COPY6:%[0-9]+]]:gprc = COPY $r4
+  ; CHECK:   STW [[COPY6]], 0, %fixed-stack.0 :: (store 4 into %fixed-stack.0)
+  ; CHECK:   STW [[COPY5]], 4, %fixed-stack.0 :: (store 4 into %fixed-stack.0 + 4)
+  ; CHECK:   STW [[COPY4]], 8, %fixed-stack.0 :: (store 4)
+  ; CHECK:   STW [[COPY3]], 12, %fixed-stack.0 :: (store 4)
+  ; CHECK:   STW [[COPY2]], 16, %fixed-stack.0 :: (store 4)
+  ; CHECK:   STW [[COPY1]], 20, %fixed-stack.0 :: (store 4)
+  ; CHECK:   STW [[COPY]], 24, %fixed-stack.0 :: (store 4)
+  ; CHECK:   LIFETIME_START %stack.0.arg_list
+  ; CHECK:   [[ADDI:%[0-9]+]]:gprc = ADDI %fixed-stack.0, 0
+  ; CHECK:   STW killed [[ADDI]], 0, %stack.0.arg_list :: (store 4 into %ir.0)
+  ; CHECK:   [[ADDI1:%[0-9]+]]:gprc = ADDI %fixed-stack.0, 15
+  ; CHECK:   [[RLWINM:%[0-9]+]]:gprc = RLWINM killed [[ADDI1]], 0, 0, 27
+  ; CHECK:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[RLWINM]] :: (load 16 from %ir.4)
+  ; CHECK:   LIFETIME_END %stack.0.arg_list
+  ; CHECK:   $v2 = COPY [[LXVW4X]]
+  ; CHECK:   BLR implicit $lr, implicit $rm, implicit $v2
+entry:
+  %arg_list = alloca i8*, align 4
+  %0 = bitcast i8** %arg_list to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %arg_list, align 4
+  %1 = ptrtoint i8* %argp.cur to i32
+  %2 = add i32 %1, 15
+  %3 = and i32 %2, -16
+  %argp.cur.aligned = inttoptr i32 %3 to i8*
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur.aligned, i32 16
+  store i8* %argp.next, i8** %arg_list, align 4
+  %4 = inttoptr i32 %3 to <4 x i32>*
+  %5 = load <4 x i32>, <4 x i32>* %4, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret <4 x i32> %5
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
diff --git a/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-caller-split.ll b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-caller-split.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-caller-split.ll
@@ -0,0 +1,13 @@
+; RUN: not --crash llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | \
+; RUN:     FileCheck %s
+
+define void @caller() {
+entry:
+  %call = tail call <4 x i32> (double, double, double, ...) @split_spill(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  ret void
+}
+
+declare <4 x i32> @split_spill(double, double, double, ...)
+
+; CHECK: ERROR: Unexpected register handling for calling convention.
diff --git a/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-fixed-callee.ll b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-fixed-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix32-vector-vararg-fixed-callee.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff < %s | \
+; RUN:     FileCheck %s
+
+define double @callee(i32 %count, <4 x i32> %vsi, double %next, ...) {
+  ; CHECK-LABEL: name: callee
+  ; CHECK: bb.0.entry:
+  ; CHECK:   LIFETIME_START %stack.0.arg_list
+  ; CHECK:   [[ADDI:%[0-9]+]]:gprc = ADDI %fixed-stack.0, 0
+  ; CHECK:   STW killed [[ADDI]], 0, %stack.0.arg_list :: (store 4 into %ir.0)
+  ; CHECK:   [[ADDI1:%[0-9]+]]:gprc = ADDI %fixed-stack.0, 15
+  ; CHECK:   [[RLWINM:%[0-9]+]]:gprc_and_gprc_nor0 = RLWINM killed [[ADDI1]], 0, 0, 27
+  ; CHECK:   [[ADDI2:%[0-9]+]]:gprc = nuw ADDI killed [[RLWINM]], 16
+  ; CHECK:   [[XFLOADf64_:%[0-9]+]]:vsfrc = XFLOADf64 $zero, killed [[ADDI2]] :: (load 8 from %ir.4, align 16)
+  ; CHECK:   LIFETIME_END %stack.0.arg_list
+  ; CHECK:   $f1 = COPY [[XFLOADf64_]]
+  ; CHECK:   BLR implicit $lr, implicit $rm, implicit $f1
+entry:
+  %arg_list = alloca i8*, align 4
+  %0 = bitcast i8** %arg_list to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %arg_list, align 4
+  %1 = ptrtoint i8* %argp.cur to i32
+  %2 = add i32 %1, 15
+  %3 = and i32 %2, -16
+  %argp.cur.aligned = inttoptr i32 %3 to i8*
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur.aligned, i32 16
+  %argp.next3 = getelementptr inbounds i8, i8* %argp.cur.aligned, i32 24
+  store i8* %argp.next3, i8** %arg_list, align 4
+  %4 = bitcast i8* %argp.next to double*
+  %5 = load double, double* %4, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret double %5
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
diff --git a/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-callee.ll b/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-callee.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff < %s | \
+; RUN:     FileCheck %s
+
+define <4 x i32> @callee(i32 signext %count, ...) {
+  ; CHECK-LABEL: name: callee
+  ; CHECK: bb.0.entry:
+  ; CHECK:   liveins: $x4, $x5, $x6, $x7, $x8, $x9, $x10
+  ; CHECK:   [[COPY:%[0-9]+]]:g8rc = COPY $x10
+  ; CHECK:   [[COPY1:%[0-9]+]]:g8rc = COPY $x9
+  ; CHECK:   [[COPY2:%[0-9]+]]:g8rc = COPY $x8
+  ; CHECK:   [[COPY3:%[0-9]+]]:g8rc = COPY $x7
+  ; CHECK:   [[COPY4:%[0-9]+]]:g8rc = COPY $x6
+  ; CHECK:   [[COPY5:%[0-9]+]]:g8rc = COPY $x5
+  ; CHECK:   [[COPY6:%[0-9]+]]:g8rc = COPY $x4
+  ; CHECK:   STD [[COPY6]], 0, %fixed-stack.0 :: (store 8 into %fixed-stack.0)
+  ; CHECK:   STD [[COPY5]], 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8)
+  ; CHECK:   STD [[COPY4]], 16, %fixed-stack.0 :: (store 8)
+  ; CHECK:   STD [[COPY3]], 24, %fixed-stack.0 :: (store 8)
+  ; CHECK:   STD [[COPY2]], 32, %fixed-stack.0 :: (store 8)
+  ; CHECK:   STD [[COPY1]], 40, %fixed-stack.0 :: (store 8)
+  ; CHECK:   STD [[COPY]], 48, %fixed-stack.0 :: (store 8)
+  ; CHECK:   LIFETIME_START %stack.0.arg_list
+  ; CHECK:   [[ADDI8_:%[0-9]+]]:g8rc = ADDI8 %fixed-stack.0, 0
+  ; CHECK:   STD killed [[ADDI8_]], 0, %stack.0.arg_list :: (store 8 into %ir.0)
+  ; CHECK:   [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 %fixed-stack.0, 15
+  ; CHECK:   [[RLDICR:%[0-9]+]]:g8rc = RLDICR killed [[ADDI8_1]], 0, 59
+  ; CHECK:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[RLDICR]] :: (load 16 from %ir.4)
+  ; CHECK:   LIFETIME_END %stack.0.arg_list
+  ; CHECK:   $v2 = COPY [[LXVW4X]]
+  ; CHECK:   BLR8 implicit $lr8, implicit $rm, implicit $v2
+entry:
+  %arg_list = alloca i8*, align 8
+  %0 = bitcast i8** %arg_list to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0)
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %arg_list, align 8
+  %1 = ptrtoint i8* %argp.cur to i64
+  %2 = add i64 %1, 15
+  %3 = and i64 %2, -16
+  %argp.cur.aligned = inttoptr i64 %3 to i8*
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur.aligned, i64 16
+  store i8* %argp.next, i8** %arg_list, align 8
+  %4 = inttoptr i64 %3 to <4 x i32>*
+  %5 = load <4 x i32>, <4 x i32>* %4, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0)
+  ret <4 x i32> %5
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
diff --git a/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-fixed-callee.ll b/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-fixed-callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix64-vector-vararg-fixed-callee.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -verify-machineinstrs -stop-before=ppc-vsx-copy -vec-extabi \
+; RUN:     -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff < %s | \
+; RUN:     FileCheck %s
+
+define double @callee(i32 signext %count, <4 x i32> %vsi, double %next, ...) {
+  ; CHECK-LABEL: name: callee
+  ; CHECK: bb.0.entry:
+  ; CHECK:   liveins: $x8, $x9, $x10
+  ; CHECK:   [[COPY:%[0-9]+]]:g8rc = COPY $x10
+  ; CHECK:   [[COPY1:%[0-9]+]]:g8rc = COPY $x9
+  ; CHECK:   [[COPY2:%[0-9]+]]:g8rc = COPY $x8
+  ; CHECK:   STD [[COPY2]], 0, %fixed-stack.0 :: (store 8 into %fixed-stack.0)
+  ; CHECK:   STD [[COPY1]], 8, %fixed-stack.0 :: (store 8 into %fixed-stack.0 + 8)
+  ; CHECK:   STD [[COPY]], 16, %fixed-stack.0 :: (store 8)
+  ; CHECK:   LIFETIME_START %stack.0.arg_list
+  ; CHECK:   [[ADDI8_:%[0-9]+]]:g8rc = ADDI8 %fixed-stack.0, 0
+  ; CHECK:   STD killed [[ADDI8_]], 0, %stack.0.arg_list :: (store 8 into %ir.0)
+  ; CHECK:   [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 %fixed-stack.0, 15
+  ; CHECK:   [[RLDICR:%[0-9]+]]:g8rc_and_g8rc_nox0 = RLDICR killed [[ADDI8_1]], 0, 59
+  ; CHECK:   [[LI8_:%[0-9]+]]:g8rc = LI8 16
+  ; CHECK:   [[XFLOADf64_:%[0-9]+]]:vsfrc = XFLOADf64 killed [[RLDICR]], killed [[LI8_]] :: (load 8 from %ir.4, align 16)
+  ; CHECK:   LIFETIME_END %stack.0.arg_list
+  ; CHECK:   $f1 = COPY [[XFLOADf64_]]
+  ; CHECK:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %arg_list = alloca i8*, align 8
+  %0 = bitcast i8** %arg_list to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0)
+  call void @llvm.va_start(i8* nonnull %0)
+  %argp.cur = load i8*, i8** %arg_list, align 8
+  %1 = ptrtoint i8* %argp.cur to i64
+  %2 = add i64 %1, 15
+  %3 = and i64 %2, -16
+  %argp.cur.aligned = inttoptr i64 %3 to i8*
+  %argp.next = getelementptr inbounds i8, i8* %argp.cur.aligned, i64 16
+  %argp.next3 = getelementptr inbounds i8, i8* %argp.cur.aligned, i64 24
+  store i8* %argp.next3, i8** %arg_list, align 8
+  %4 = bitcast i8* %argp.next to double*
+  %5 = load double, double* %4, align 16
+  call void @llvm.va_end(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0)
+  ret double %5
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
+
+declare void @llvm.va_start(i8*)
+
+declare void @llvm.va_end(i8*)
+
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)