Index: lib/Target/X86/CMakeLists.txt
===================================================================
--- lib/Target/X86/CMakeLists.txt
+++ lib/Target/X86/CMakeLists.txt
@@ -40,6 +40,7 @@
   X86VZeroUpper.cpp
   X86WinAllocaExpander.cpp
   X86WinEHState.cpp
+  X86CallingConv.cpp
   )
 add_llvm_target(X86CodeGen ${sources})
Index: lib/Target/X86/X86CallingConv.h
===================================================================
--- lib/Target/X86/X86CallingConv.h
+++ lib/Target/X86/X86CallingConv.h
@@ -21,6 +21,14 @@
 namespace llvm {
 
+/// When the regcall calling convention is compiled for a 32 bit arch, special
+/// treatment is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// @return true if registers were allocated and false otherwise.
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
 inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
                                          MVT &LocVT,
                                          CCValAssign::LocInfo &LocInfo,
Index: lib/Target/X86/X86CallingConv.cpp
===================================================================
--- lib/Target/X86/X86CallingConv.cpp
+++ lib/Target/X86/X86CallingConv.cpp
@@ -0,0 +1,60 @@
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // List of GPR registers that are available to store values in regcall
+  // calling convention.
+  static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+                                      X86::ESI};
+
+  // The vector will save all the available registers for allocation.
+  SmallVector<MCPhysReg, 5> AvailableRegs;
+
+  // Search for the available registers.
+  for (auto Reg : RegList) {
+    if (!State.isAllocated(Reg))
+      AvailableRegs.push_back(Reg);
+  }
+
+  const size_t RequiredGprsUponSplit = 2;
+  if (AvailableRegs.size() < RequiredGprsUponSplit)
+    return false; // Not enough free registers - continue the search.
+
+  // Allocating the available registers.
+  for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+    // Mark the register as allocated.
+    unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+    // Since we previously made sure that 2 registers are available
+    // we expect that a real register number will be returned.
+    assert(Reg && "Expecting a register to be available");
+
+    // Assign the value to the allocated register.
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  }
+
+  // Successfully allocated registers - stop scanning further rules.
+ return true; +} + +} // End llvm namespace Index: lib/Target/X86/X86CallingConv.td =================================================================== --- lib/Target/X86/X86CallingConv.td +++ lib/Target/X86/X86CallingConv.td @@ -77,14 +77,19 @@ // bool, char, int, enum, long, pointer --> GPR CCIfType<[i32], CCAssignToReg>, - // TODO: Handle the case of mask types (v*i1) - // TODO: Handle the case of 32 bit machine with v64i1 argument - // (split to 2 registers) - CCIfType<[v8i1, v16i1, v32i1, v64i1], CCCustom<"CC_X86_RegCall_Error">>, - // long long, __int64 --> GPR CCIfType<[i64], CCAssignToReg>, + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // TODO: Handle the case of mask types (v*i1) + CCIfType<[v8i1, v16i1, v32i1], CCCustom<"CC_X86_RegCall_Error">>, + // TODO: Handle the case of long double (f80) CCIfType<[f80], CCCustom<"CC_X86_RegCall_Error">>, @@ -116,7 +121,7 @@ // In 32 bit, assign 64/32 bit values to 8/4 byte stack CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[f64], CCAssignToStack<8, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 4>>, // MMX type gets 8 byte slot in stack , while alignment depends on target CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, @@ -147,14 +152,19 @@ CCIfType<[i16], CCAssignToReg>, CCIfType<[i32], CCAssignToReg>, - // TODO: Handle the case of mask types (v*i1) - // TODO: Handle the case of 32 bit machine with v64i1 argument - // (split to 2 registers) - CCIfType<[v8i1, v16i1, v32i1, v64i1], CCCustom<"CC_X86_RegCall_Error">>, - // long long, __int64 --> GPR CCIfType<[i64], CCAssignToReg>, + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // TODO: Handle the case of mask types (v*i1) + CCIfType<[v8i1, v16i1, v32i1], CCCustom<"CC_X86_RegCall_Error">>, + // long double --> FP CCIfType<[f80], CCAssignToReg<[FP0]>>, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -2086,6 +2086,46 @@ return ScratchRegs; } +/// Lowers masks values (v*i1) to the local register values +static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, + const SDLoc &Dl, SelectionDAG &DAG) { + EVT ValVT = ValArg.getValueType(); + + if (ValVT == MVT::v64i1 && ValLoc == MVT::i64) { + // One stage lowering is required + // bitcast: v64i1 -> i64 + return DAG.getBitcast(MVT::i64, ValArg); + } else + return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg); +} + +/// Breaks v64i1 value into two registers and adds the new node to the DAG +static void Passv64i1ArgInRegs( + const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, + SmallVector, 8> &RegsToPass, CCValAssign &VA, + CCValAssign &NextVA, const X86Subtarget &Subtarget) { + assert((Subtarget.hasBWI() || Subtarget.hasBMI()) && + "Expected AVX512BW or AVX512BMI target!"); + assert(Subtarget.is32Bit() && "Expecting 32 bit target"); + assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "The value should reside in two registers"); + + // Before splitting 
the value we cast it to i64
+  Arg = DAG.getBitcast(MVT::i64, Arg);
+
+  // Split the value into two i32 types.
+  SDValue Lo, Hi;
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(0, Dl, MVT::i32));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(1, Dl, MVT::i32));
+
+  // Attach the two i32 values to the corresponding registers.
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
+}
+
 SDValue
 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
@@ -2110,10 +2150,11 @@
                     MVT::i32));
 
   // Copy the result values into the output registers.
-  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
-    CCValAssign &VA = RVLocs[i];
+  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
+       ++I, ++OutsIndex) {
+    CCValAssign &VA = RVLocs[I];
     assert(VA.isRegLoc() && "Can only return in registers!");
-    SDValue ValToCopy = OutVals[i];
+    SDValue ValToCopy = OutVals[OutsIndex];
     EVT ValVT = ValToCopy.getValueType();
 
     // Promote values to the appropriate types.
@@ -2123,7 +2164,7 @@
       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
     else if (VA.getLocInfo() == CCValAssign::AExt) {
       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
-        ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
       else
         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
     }
@@ -2176,9 +2217,27 @@
       }
     }
 
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
-    Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+
+      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+                         Subtarget);
+
+      assert(2 == RegsToPass.size() &&
+             "Expecting two registers after Passv64i1ArgInRegs");
+    } else {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+    }
+
+    // Add nodes to the DAG and add the values into the RetOps list.
+    for (auto &Reg : RegsToPass) {
+      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+    }
   }
 
   // Swift calling convention does not require we copy the sret argument
@@ -2306,6 +2365,83 @@
   return VT.bitsLT(MinVT) ? MinVT : VT;
 }
 
+/// Reads two 32 bit registers and creates a 64 bit mask value.
+/// @param VA The current 32 bit value that needs to be assigned.
+/// @param NextVA The next 32 bit value that needs to be assigned.
+/// @param Root The parent DAG node.
+/// @param [in,out] InFlag Represents an SDValue in the parent DAG node for
+///                        glue purposes. In case the DAG is already using a
+///                        physical register instead of a virtual one, we
+///                        should glue our new SDValue to the InFlag SDValue.
+/// @return a new SDValue of size 64 bit.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+                                SDValue &Root, SelectionDAG &DAG,
+                                const SDLoc &Dl, const X86Subtarget &Subtarget,
+                                SDValue *InFlag = nullptr) {
+  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(VA.getValVT() == MVT::v64i1 &&
+         "Expecting first location of 64 bit width type");
+  assert(NextVA.getValVT() == VA.getValVT() &&
+         "The locations should have the same type");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The values should reside in two registers");
+
+  SDValue Lo, Hi;
+  unsigned Reg;
+  SDValue ArgValueLo, ArgValueHi;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = &X86::GR32RegClass;
+
+  // Read a 32 bit value from the registers.
+  if (nullptr == InFlag) {
+    // When no physical register is present,
+    // create an intermediate virtual register.
+    Reg = MF.addLiveIn(VA.getLocReg(), RC);
+    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+  } else {
+    // When a physical register is available read the value from it and glue
+    // the reads together.
+    ArgValueLo =
+        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = ArgValueLo.getValue(2);
+    ArgValueHi =
+        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = ArgValueHi.getValue(2);
+  }
+
+  // Convert the low i32 value into a v32i1 type.
+  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
+
+  // Convert the high i32 value into a v32i1 type.
+  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
+
+  // Concatenate the two values together.
+  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
+}
+
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+                               const EVT &ValLoc, const SDLoc &Dl,
+                               SelectionDAG &DAG) {
+  assert((ValLoc == MVT::i64 || ValLoc == MVT::i32) &&
+         "Expecting register location of size 32/64 bit");
+
+  // Currently not referenced - will be used in other mask lowerings.
+  (void)Dl;
+
+  // In the case of v64i1 no special handling is required for two reasons:
+  // On a 32 bit machine, this case is handled by getv64i1Argument.
+  // On a 64 bit machine, there is no need to truncate the value, only to
+  // bitcast it.
+  if (ValVT == MVT::v64i1 && ValLoc == MVT::i32) {
+    llvm_unreachable("Expecting only i64 locations");
+  }
+
+  return DAG.getBitcast(ValVT, ValArg);
+}
+
 /// Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 ///
@@ -2322,13 +2458,14 @@
   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
- for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { - CCValAssign &VA = RVLocs[i]; + for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; + ++I, ++InsIndex) { + CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) { + ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2343,19 +2480,33 @@ RoundAfterCopy = (CopyVT != VA.getLocVT()); } - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - CopyVT, InFlag).getValue(1); - SDValue Val = Chain.getValue(0); + SDValue Val; + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + Val = + getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); + } else { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) + .getValue(1); + Val = Chain.getValue(0); + InFlag = Chain.getValue(2); + } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); - if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) - Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { + if (VA.getValVT().isVector() && + (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::i64)) { + // promoting a mask type (v*i1) into a register of type i64/i32 + Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); + } else + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + } - InFlag = Chain.getValue(2); InVals.push_back(Val); } @@ -2423,7 +2574,8 @@ /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE || CC == CallingConv::HHVM); + CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || + CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -2478,9 +2630,11 @@ EVT ValVT; // If value is passed by pointer we have address passed instead of the value - // itself. - bool ExtendedInMem = VA.isExtInLoc() && - VA.getValVT().getScalarType() == MVT::i1; + // itself. No need to extend if the mask value and location share the same + // absolute size. 
+ bool ExtendedInMem = + VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && + VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); @@ -2604,8 +2758,9 @@ bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); - assert(!(isVarArg && canGuaranteeTCO(CallConv)) && - "Var args not supported with calling convention fastcc, ghc or hipe"); + assert( + !(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || @@ -2625,54 +2780,60 @@ CCInfo.AnalyzeFormalArguments(Ins, CC_X86); - unsigned LastVal = ~0U; SDValue ArgValue; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - // TODO: If an arg is passed in two places (e.g. reg and stack), skip later - // places. - assert(VA.getValNo() != LastVal && - "Don't support value assigned to multiple locs yet"); - (void)LastVal; - LastVal = VA.getValNo(); + for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++InsIndex) { + assert(InsIndex < Ins.size() && "Invalid Ins index"); + CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - const TargetRegisterClass *RC; - if (RegVT == MVT::i32) - RC = &X86::GR32RegClass; - else if (Is64Bit && RegVT == MVT::i64) - RC = &X86::GR64RegClass; - else if (RegVT == MVT::f32) - RC = &X86::FR32RegClass; - else if (RegVT == MVT::f64) - RC = &X86::FR64RegClass; - else if (RegVT == MVT::f128) - RC = &X86::FR128RegClass; - else if (RegVT.is512BitVector()) - RC = &X86::VR512RegClass; - else if (RegVT.is256BitVector()) - RC = &X86::VR256RegClass; - else if (RegVT.is128BitVector()) - RC = &X86::VR128RegClass; - else if (RegVT == MVT::x86mmx) - RC = &X86::VR64RegClass; - else if (RegVT == MVT::i1) - RC = &X86::VK1RegClass; - else if (RegVT == MVT::v8i1) - RC = &X86::VK8RegClass; - else if (RegVT == MVT::v16i1) - RC = &X86::VK16RegClass; - else if (RegVT == MVT::v32i1) - RC = &X86::VK32RegClass; - else if (RegVT == MVT::v64i1) - RC = &X86::VK64RegClass; - else - llvm_unreachable("Unknown argument type!"); + if (VA.needsCustom()) { + assert( + VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + // v64i1 values, in regcall calling convention, that are + // compiled to 32 bit arch, are splited up into two registers. 
+ ArgValue = + getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); + } else { + const TargetRegisterClass *RC; + if (RegVT == MVT::i32) + RC = &X86::GR32RegClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = &X86::GR64RegClass; + else if (RegVT == MVT::f32) + RC = &X86::FR32RegClass; + else if (RegVT == MVT::f64) + RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; + else if (RegVT.is256BitVector()) + RC = &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = &X86::VR128RegClass; + else if (RegVT == MVT::x86mmx) + RC = &X86::VR64RegClass; + else if (RegVT == MVT::i1) + RC = &X86::VK1RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; + else + llvm_unreachable("Unknown argument type!"); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } + // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. @@ -2689,12 +2850,18 @@ // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); - else + else if (VA.getValVT().isVector() && + VA.getValVT().getScalarType() == MVT::i1 && + ((RegVT == MVT::i32) || (RegVT == MVT::i64))) { + // Promoting a mask type (v*i1) into a register of type i64/i32 + ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); + } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); - ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + ArgValue = + LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. @@ -2705,7 +2872,7 @@ InVals.push_back(ArgValue); } - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) @@ -2715,14 +2882,14 @@ // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. - if (Ins[i].Flags.isSRet()) { + if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } @@ -3114,15 +3281,17 @@ // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. 
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++OutIndex) { + assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. - ISD::ArgFlagsTy Flags = Outs[i].Flags; + ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; - CCValAssign &VA = ArgLocs[i]; + CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); - SDValue Arg = OutVals[i]; + SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. @@ -3138,7 +3307,7 @@ case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) - Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); @@ -3162,7 +3331,13 @@ } } - if (VA.isRegLoc()) { + if (VA.needsCustom()) { + assert(VA.getValVT() == MVT::v64i1 && + "Currently the only custom case is when we split v64i1 to 2 regs"); + // Split v64i1 value into two registers + Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], + Subtarget); + } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding @@ -3262,13 +3437,25 @@ SmallVector MemOpChains2; SDValue FIN; int FI = 0; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (VA.isRegLoc()) + for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; + ++I, ++OutsIndex) { + CCValAssign &VA = ArgLocs[I]; + + if (VA.isRegLoc()) { + if (VA.needsCustom()) { + assert((CallConv == CallingConv::X86_RegCall) && + "Expecting custome case only in regcall calling convention"); + // This means that we are in special case where one argument was + // passed through two register locations - Skip the next location + ++I; + } + continue; + } + assert(VA.isMemLoc()); - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[OutsIndex]; + ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. 
if (Flags.isInAlloca()) continue; Index: test/CodeGen/X86/avx512-regcall-Mask.ll =================================================================== --- test/CodeGen/X86/avx512-regcall-Mask.ll +++ test/CodeGen/X86/avx512-regcall-Mask.ll @@ -0,0 +1,195 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512bw | FileCheck --check-prefix=X32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512bw | FileCheck --check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512bw | FileCheck --check-prefix=LINUXOSX64 %s + +; X32-LABEL: test_argv64i1: +; X32: kmovd %edx, %k0 +; X32: kmovd %edi, %k1 +; X32: kmovd %eax, %k1 +; X32: kmovd %ecx, %k2 +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x +; X32: retl + +; WIN64-LABEL: test_argv64i1: +; WIN64: addq %rcx, %rax +; WIN64: addq %rdx, %rax +; WIN64: addq %rdi, %rax +; WIN64: addq %rsi, %rax +; WIN64: addq %r8, %rax +; WIN64: addq %r9, %rax +; WIN64: addq %r10, %rax +; WIN64: addq %r11, %rax +; WIN64: addq %r12, %rax +; WIN64: addq %r14, %rax +; WIN64: addq %r15, %rax +; WIN64: addq {{([0-9])*}}(%rsp), %rax +; WIN64: retq + +; LINUXOSX64-LABEL: test_argv64i1: +; LINUXOSX64: addq %rcx, %rax +; LINUXOSX64: addq %rdx, %rax +; LINUXOSX64: addq %rdi, %rax +; LINUXOSX64: addq %rsi, %rax +; LINUXOSX64: addq %r8, %rax +; LINUXOSX64: addq %r9, %rax +; LINUXOSX64: addq %r12, %rax +; LINUXOSX64: addq %r13, %rax +; LINUXOSX64: addq %r14, %rax +; LINUXOSX64: addq %r15, %rax +; LINUXOSX64: addq {{([0-9])*}}(%rsp), %rax +; LINUXOSX64: addq {{([0-9])*}}(%rsp), %rax +; LINUXOSX64: retq + +; Test regcall when receiving arguments of v64i1 type +define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1> %x2, + <64 x i1> %x3, <64 x i1> %x4, <64 x i1> %x5, + <64 x i1> %x6, <64 x i1> %x7, <64 x i1> %x8, + <64 x i1> %x9, <64 x i1> %x10, <64 x i1> %x11, + <64 x i1> %x12) { + %y0 = bitcast <64 x i1> %x0 to i64 + %y1 = bitcast <64 x i1> %x1 to i64 + %y2 = bitcast <64 x i1> %x2 to i64 + %y3 = bitcast <64 x i1> %x3 to i64 + %y4 = bitcast <64 x i1> %x4 to i64 + %y5 = bitcast <64 x i1> %x5 to i64 + %y6 = bitcast <64 x i1> %x6 to i64 + %y7 = bitcast <64 x i1> %x7 to i64 + %y8 = bitcast <64 x i1> %x8 to i64 + %y9 = bitcast <64 x i1> %x9 to i64 + %y10 = bitcast <64 x i1> %x10 to i64 + %y11 = bitcast <64 x i1> %x11 to i64 + %y12 = bitcast <64 x i1> %x12 to i64 + %add1 = add i64 %y0, %y1 + %add2 = add i64 %add1, %y2 + %add3 = add i64 %add2, %y3 + %add4 = add i64 %add3, %y4 + %add5 = add i64 %add4, %y5 + %add6 = 
add i64 %add5, %y6 + %add7 = add i64 %add6, %y7 + %add8 = add i64 %add7, %y8 + %add9 = add i64 %add8, %y9 + %add10 = add i64 %add9, %y10 + %add11 = add i64 %add10, %y11 + %add12 = add i64 %add11, %y12 + ret i64 %add12 +} + +; X32-LABEL: caller_argv64i1: +; X32: movl $2, %eax +; X32: movl $1, %ecx +; X32: movl $2, %edx +; X32: movl $1, %edi +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: pushl ${{1|2}} +; X32: call{{.*}} _test_argv64i1 + +; WIN64-LABEL: caller_argv64i1: +; WIN64: movabsq $4294967298, %rax +; WIN64: movq %rax, (%rsp) +; WIN64: movq %rax, %rcx +; WIN64: movq %rax, %rdx +; WIN64: movq %rax, %rdi +; WIN64: movq %rax, %rsi +; WIN64: movq %rax, %r8 +; WIN64: movq %rax, %r9 +; WIN64: movq %rax, %r10 +; WIN64: movq %rax, %r11 +; WIN64: movq %rax, %r12 +; WIN64: movq %rax, %r14 +; WIN64: movq %rax, %r15 +; WIN64: callq test_argv64i1 + +; LINUXOSX64-LABEL: caller_argv64i1: +; LINUXOSX64: movabsq $4294967298, %rax +; LINUXOSX64: movq %rax, %rcx +; LINUXOSX64: movq %rax, %rdx +; LINUXOSX64: movq %rax, %rdi +; LINUXOSX64: movq %rax, %rsi +; LINUXOSX64: movq %rax, %r8 +; LINUXOSX64: movq %rax, %r9 +; LINUXOSX64: movq %rax, %r12 +; LINUXOSX64: movq %rax, %r13 +; LINUXOSX64: movq %rax, %r14 +; LINUXOSX64: movq %rax, %r15 +; LINUXOSX64: call{{.*}} test_argv64i1 + +; Test regcall when passing arguments of v64i1 type +define x86_regcallcc i64 @caller_argv64i1() #0 { +entry: + %v0 = bitcast i64 4294967298 to <64 x i1> + %call = call x86_regcallcc i64 @test_argv64i1(<64 x i1> %v0, <64 x i1> %v0, <64 x i1> %v0, + <64 x i1> %v0, <64 x i1> %v0, <64 x i1> %v0, + <64 x i1> %v0, <64 x i1> %v0, <64 x i1> %v0, + <64 x i1> %v0, <64 x i1> %v0, <64 x i1> %v0, + <64 x i1> %v0) + ret i64 %call +} + +; X32-LABEL: test_retv64i1: +; X32: mov{{.*}} $2, %eax +; X32: mov{{.*}} $1, %ecx +; X32: ret{{.*}} + +; WIN64-LABEL: test_retv64i1: +; WIN64: mov{{.*}} $4294967298, %rax +; WIN64: ret{{.*}} + +; Test regcall when returning v64i1 type +define x86_regcallcc <64 x i1> @test_retv64i1() { + %a = bitcast i64 4294967298 to <64 x i1> + ret <64 x i1> %a +} + +; X32-LABEL: caller_retv64i1: +; X32: call{{.*}} _test_retv64i1 +; X32: kmov{{.*}} %eax, %k0 +; X32: kmov{{.*}} %ecx, %k1 +; X32: kunpckdq %k0, %k1, %k0 + +; Test regcall when processing result of v64i1 type +define x86_regcallcc <64 x i1> @caller_retv64i1() #0 { +entry: + %call = call x86_regcallcc <64 x i1> @test_retv64i1() + ret <64 x i1> %call +}
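
; A minimal illustrative sketch (an assumed example, not taken from the patch
; above; the function and value names here are hypothetical): a regcall
; function that both receives and returns a v64i1 mask. On a 32 bit target
; both directions go through the split-into-two-GPRs path added by
; CC_X86_32_RegCall_Assign2Regs, while on a 64 bit target the mask is promoted
; to i64 and travels in a single GPR.
define x86_regcallcc <64 x i1> @sketch_v64i1_passthrough(<64 x i1> %mask) {
entry:
  ret <64 x i1> %mask
}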