Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -184,6 +184,112 @@
   }
 }
 
+// Check whether we can vectorize the argument elements starting at \p
+// Idx, using accesses \p AccessSize bytes wide. Returns the number of
+// elements we can process in a single ld/st operation (1 for scalar,
+// 2 or 4 for vectors).
+static unsigned CanVectorizeAt(unsigned Idx, uint32_t AccessSize,
+                               const SmallVectorImpl<EVT> &ValueVTs,
+                               const SmallVectorImpl<uint64_t> &Offsets,
+                               unsigned ParamAlignment) {
+  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+
+  // Can't vectorize if param alignment is not sufficient.
+  if (AccessSize > ParamAlignment)
+    return 1;
+  // Can't vectorize if offset is not aligned.
+  if (Offsets[Idx] & (AccessSize - 1))
+    return 1;
+
+  EVT EltVT = ValueVTs[Idx];
+  unsigned EltSize = EltVT.getStoreSize();
+
+  // Element is too large to vectorize.
+  if (EltSize >= AccessSize)
+    return 1;
+
+  unsigned NumElts = AccessSize / EltSize;
+  // We don't have enough elements to vectorize.
+  if (Idx + NumElts > ValueVTs.size())
+    return 1;
+
+  // Can't vectorize if AccessSize is not a multiple of EltSize.
+  if (AccessSize != EltSize * NumElts)
+    return 1;
+
+  // PTX ISA can only deal with 2- and 4-element vector ops.
+  if (NumElts != 4 && NumElts != 2)
+    return 1;
+
+  for (unsigned j = 1; j < NumElts; ++j) {
+    // Types do not match.
+    if (ValueVTs[Idx + j] != EltVT)
+      return 1;
+
+    // Elements are not contiguous.
+    if (Offsets[Idx + j] - Offsets[Idx + j - 1] != EltSize)
+      return 1;
+  }
+  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
+  return NumElts;
+}
+
+enum PtxVectorInfo {
+  PTX_LDST_VECTORIZED = 0x0, // Middle elements of a vector.
+  PTX_LDST_BEGIN = 0x1,      // First element of the vector.
+  PTX_LDST_END = 0x2,        // Last element of the vector.
+  // Scalar is effectively a 1-element vector.
+  PTX_LDST_SCALAR = PTX_LDST_BEGIN | PTX_LDST_END
+};
+
+// For each pair of elements in \p ValueVTs and \p Offsets produced by
+// ComputePTXValueVTs() for a function argument, and the argument
+// alignment \p ParamAlignment, returns \p VectorInfo with each
+// element indicating whether its load/store can be vectorized.
+static void VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+                                 const SmallVectorImpl<uint64_t> &Offsets,
+                                 unsigned ParamAlignment,
+                                 SmallVectorImpl<unsigned> &VectorInfo) {
+  // Set vector size to match ValueVTs and mark all elements as
+  // scalars by default.
+  VectorInfo.assign(ValueVTs.size(), PTX_LDST_SCALAR);
+
+  // Check what we can vectorize using 128/64/32/16-bit accesses.
+  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+    // Skip elements we've already processed.
+    assert(VectorInfo[I] == PTX_LDST_SCALAR && "Unexpected vector info state.");
+    for (unsigned AccessSize : {16, 8, 4, 2}) {
+      unsigned NumElts =
+          CanVectorizeAt(I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+      // Mark vectorized elements.
+      switch (NumElts) {
+      default:
+        llvm_unreachable("Unexpected return value");
+      case 1:
+        // Can't vectorize using this size, try next smaller size.
+        continue;
+      case 2:
+        assert(I + 1 < E && "Not enough elements.");
+        VectorInfo[I] = PTX_LDST_BEGIN;
+        VectorInfo[I + 1] = PTX_LDST_END;
+        I += 1;
+        break;
+      case 4:
+        assert(I + 3 < E && "Not enough elements.");
+        VectorInfo[I] = PTX_LDST_BEGIN;
+        VectorInfo[I + 1] = PTX_LDST_VECTORIZED;
+        VectorInfo[I + 2] = PTX_LDST_VECTORIZED;
+        VectorInfo[I + 3] = PTX_LDST_END;
+        I += 3;
+        break;
+      }
+      // Break out of the inner loop because we've already succeeded
+      // using the largest possible AccessSize.
+      break;
+    }
+  }
+}
+
 // NVPTXTargetLowering Constructor.
 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                          const NVPTXSubtarget &STI)
@@ -1276,21 +1382,18 @@
   SDValue Callee = CLI.Callee;
   bool &isTailCall = CLI.IsTailCall;
   ArgListTy &Args = CLI.getArgs();
-  Type *retTy = CLI.RetTy;
+  Type *RetTy = CLI.RetTy;
   ImmutableCallSite *CS = CLI.CS;
+  const DataLayout &DL = DAG.getDataLayout();
 
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
-  MachineFunction &MF = DAG.getMachineFunction();
-  const Function *F = MF.getFunction();
-  auto &DL = MF.getDataLayout();
 
   SDValue tempChain = Chain;
-  Chain = DAG.getCALLSEQ_START(Chain,
-                               DAG.getIntPtrConstant(uniqueCallSite, dl, true),
-                               dl);
+  Chain = DAG.getCALLSEQ_START(
+      Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
   SDValue InFlag = Chain.getValue(1);
 
   unsigned paramCount = 0;
@@ -1311,244 +1414,124 @@
     Type *Ty = Args[i].Ty;
 
     if (!Outs[OIdx].Flags.isByVal()) {
-      if (Ty->isAggregateType()) {
-        // aggregate
-        SmallVector<EVT, 16> vtparts;
-        SmallVector<uint64_t, 16> Offsets;
-        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
-                           0);
-
-        unsigned align =
-            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
-        // declare .param .align <align> .b8 .param<n>[<size>];
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
-                                                             MVT::i32),
-                                      DAG.getConstant(paramCount, dl, MVT::i32),
-                                      DAG.getConstant(sz, dl, MVT::i32),
-                                      InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                            DeclareParamOps);
-        InFlag = Chain.getValue(1);
-        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-          EVT elemtype = vtparts[j];
-          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-          SDValue StVal = OutVals[OIdx];
-          if (elemtype.getSizeInBits() < 16) {
-            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
-          }
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, dl, MVT::i32),
-                                     DAG.getConstant(Offsets[j], dl, MVT::i32),
-                                     StVal, InFlag };
-          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                          CopyParamVTs, CopyParamOps,
-                                          elemtype, MachinePointerInfo(),
-                                          ArgAlign);
-          InFlag = Chain.getValue(1);
-          ++OIdx;
-        }
-        if (vtparts.size() > 0)
-          --OIdx;
-        ++paramCount;
-        continue;
-      }
-      if (Ty->isVectorTy()) {
-        EVT ObjectVT = getValueType(DL, Ty);
-        unsigned align =
-            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
-        // declare .param .align <align> .b8 .param<n>[<size>];
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareParamOps[] = { Chain,
-                                      DAG.getConstant(align, dl, MVT::i32),
-                                      DAG.getConstant(paramCount, dl, MVT::i32),
-                                      DAG.getConstant(sz, dl, MVT::i32),
-                                      InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-
DeclareParamOps); - InFlag = Chain.getValue(1); - unsigned NumElts = ObjectVT.getVectorNumElements(); - EVT EltVT = ObjectVT.getVectorElementType(); - EVT MemVT = EltVT; - bool NeedExtend = false; - if (EltVT.getSizeInBits() < 16) { - NeedExtend = true; - EltVT = MVT::i16; - } - - // V1 store - if (NumElts == 1) { - SDValue Elt = OutVals[OIdx++]; - if (NeedExtend) - Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt); - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), Elt, - InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, CopyParamOps, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - } else if (NumElts == 2) { - SDValue Elt0 = OutVals[OIdx++]; - SDValue Elt1 = OutVals[OIdx++]; - if (NeedExtend) { - Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0); - Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1); - } - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), Elt0, - Elt1, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, - CopyParamVTs, CopyParamOps, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - } else { - unsigned curOffset = 0; - // V4 stores - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and - // the - // vector will be expanded to a power of 2 elements, so we know we can - // always round up to the next multiple of 4 when creating the vector - // stores. - // e.g. 4 elem => 1 st.v4 - // 6 elem => 2 st.v4 - // 8 elem => 2 st.v4 - // 11 elem => 3 st.v4 - unsigned VecSize = 4; - if (EltVT.getSizeInBits() == 64) - VecSize = 2; - - // This is potentially only part of a vector, so assume all elements - // are packed together. 
- unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize; - - for (unsigned i = 0; i < NumElts; i += VecSize) { - // Get values - SDValue StoreVal; - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32)); - - unsigned Opc = NVPTXISD::StoreParamV2; - - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - Ops.push_back(StoreVal); - - if (i + 1 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - - if (VecSize == 4) { - Opc = NVPTXISD::StoreParamV4; - if (i + 2 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - - if (i + 3 < NumElts) { - StoreVal = OutVals[OIdx++]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - } else { - StoreVal = DAG.getUNDEF(EltVT); - } - Ops.push_back(StoreVal); - } - - Ops.push_back(InFlag); - - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops, - MemVT, MachinePointerInfo()); - InFlag = Chain.getValue(1); - curOffset += PerStoreOffset; - } - } - ++paramCount; - --OIdx; - continue; - } - // Plain scalar - // for ABI, declare .param .b .param; - unsigned sz = VT.getSizeInBits(); - bool needExtend = false; - if (VT.isInteger()) { - if (sz < 16) - needExtend = true; - if (sz < 32) - sz = 32; - } else if (VT.isFloatingPoint() && sz < 32) - // PTX ABI requires all scalar parameters to be at least 32 - // bits in size. fp16 normally uses .b16 as its storage type - // in PTX, so its size must be adjusted here, too. - sz = 32; + // aggregate + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); + unsigned ArgAlign = + getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + // declare .param .align .b8 .param[]; + unsigned AllocSize = DL.getTypeAllocSize(Ty); SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - SDValue OutV = OutVals[OIdx]; - if (needExtend) { - // zext/sext i1 to i16 - unsigned opc = ISD::ZERO_EXTEND; - if (Outs[OIdx].Flags.isSExt()) - opc = ISD::SIGN_EXTEND; - OutV = DAG.getNode(opc, dl, MVT::i16, OutV); + bool NeedAlign; // Does argument declaration specify alignment? + if (Ty->isAggregateType() || Ty->isVectorTy()) { + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), + DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps); + NeedAlign = true; + } else { + if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { + // PTX ABI requires integral types to be at least 32 bits in + // size. FP16 is loaded/stored using i16, so it's handled + // here as well. 
+ AllocSize = 4; + } + SDValue DeclareScalarParamOps[] = { + Chain, DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize * 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareScalarParamOps); + NeedAlign = false; } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), OutV, - InFlag }; - - unsigned opcode = NVPTXISD::StoreParam; - if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32) - opcode = NVPTXISD::StoreParamU32; - else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32) - opcode = NVPTXISD::StoreParamS32; - Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, - VT, MachinePointerInfo()); - InFlag = Chain.getValue(1); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + bool ExtendIntegerRetVal = + Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; + + SmallVector VectorInfo; + VectorizePTXValueVTs(VTs, Offsets, ArgAlign, VectorInfo); + SmallVector LdStOps; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + // New load/store. + if (VectorInfo[j] & PTX_LDST_BEGIN) { + assert(LdStOps.empty() && "Orphaned operand list"); + LdStOps.push_back(Chain); + LdStOps.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); + LdStOps.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); + } + + EVT EltVT = VTs[j]; + SDValue StVal = OutVals[OIdx]; + if (ExtendIntegerRetVal) { + assert(VTs.size() == 1 && "Scalar can't have multiple parts."); + // zext/sext to i32 + StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, StVal); + } else if (EltVT.getSizeInBits() < 16) { + // Use 16-bit registers for small load-stores as it's the + // smallest general purpose register size supported by NVPTX. + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); + } + + // Record the value to store. + LdStOps.push_back(StVal); + + if (VectorInfo[j] & PTX_LDST_END) { + unsigned NumElts = LdStOps.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + LdStOps.push_back(InFlag); + + // Adjust type of load/store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[j]; + unsigned EltAlign = + NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; + + Chain = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), LdStOps, + TheStoreType, MachinePointerInfo(), EltAlign); + InFlag = Chain.getValue(1); + + // Cleanup. 
+ LdStOps.clear(); + } + ++OIdx; + } + if (VTs.size() > 0) + --OIdx; ++paramCount; continue; } - // struct or vector - SmallVector vtparts; + + // ByVal arguments + SmallVector VTs; SmallVector Offsets; auto *PTy = dyn_cast(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), - vtparts, &Offsets, 0); + ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); // declare .param .align .b8 .param[]; unsigned sz = Outs[OIdx].Flags.getByValSize(); @@ -1569,11 +1552,11 @@ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, DeclareParamOps); InFlag = Chain.getValue(1); - for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - EVT elemtype = vtparts[j]; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + EVT elemtype = VTs[j]; int curOffset = Offsets[j]; unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); - auto PtrVT = getPointerTy(DAG.getDataLayout()); + auto PtrVT = getPointerTy(DL); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], DAG.getConstant(curOffset, dl, PtrVT)); SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, @@ -1601,18 +1584,18 @@ // Handle Result if (Ins.size() > 0) { SmallVector resvtparts; - ComputeValueVTs(*this, DL, retTy, resvtparts); + ComputeValueVTs(*this, DL, RetTy, resvtparts); // Declare // .param .align 16 .b8 retval0[], or // .param .b retval0 - unsigned resultsz = DL.getTypeAllocSizeInBits(retTy); + unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); // Emit ".param .b retval0" instead of byte arrays only for // these three types to match the logic in // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. // Plus, this behavior is consistent with nvcc's. - if (retTy->isFloatingPointTy() || retTy->isIntegerTy() || - retTy->isPointerTy()) { + if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() || + RetTy->isPointerTy()) { // Scalar needs to be at least 32bit wide if (resultsz < 32) resultsz = 32; @@ -1624,7 +1607,7 @@ DeclareRetOps); InFlag = Chain.getValue(1); } else { - retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL); + retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment, dl, MVT::i32), @@ -1646,7 +1629,7 @@ // CallPrototype SDNode which will print out to the value of the string. 
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); std::string Proto = - getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS); + getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); const char *ProtoStr = nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); SDValue ProtoOps[] = { @@ -1711,175 +1694,85 @@ // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - if (retTy && retTy->isVectorTy()) { - EVT ObjectVT = getValueType(DL, retTy); - unsigned NumElts = ObjectVT.getVectorNumElements(); - EVT EltVT = ObjectVT.getVectorElementType(); - assert(STI.getTargetLowering()->getNumRegisters(F->getContext(), - ObjectVT) == NumElts && - "Vector was not scalarized"); - unsigned sz = EltVT.getSizeInBits(); - bool needTruncate = sz < 8; + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); - if (NumElts == 1) { - // Just a simple load - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - LoadRetVTs.push_back(MVT::i16); - } else - LoadRetVTs.push_back(EltVT); - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParam, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); - Chain = retval.getValue(1); - InFlag = retval.getValue(2); - SDValue Ret0 = retval; - if (needTruncate) - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0); - InVals.push_back(Ret0); - } else if (NumElts == 2) { - // LoadV2 - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - LoadRetVTs.push_back(MVT::i16); - LoadRetVTs.push_back(MVT::i16); - } else { - LoadRetVTs.push_back(EltVT); - LoadRetVTs.push_back(EltVT); - } - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParamV2, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); - Chain = retval.getValue(2); - InFlag = retval.getValue(3); - SDValue Ret0 = retval.getValue(0); - SDValue Ret1 = retval.getValue(1); - if (needTruncate) { - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0); - InVals.push_back(Ret0); - Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1); - InVals.push_back(Ret1); - } else { - InVals.push_back(Ret0); - InVals.push_back(Ret1); - } - } else { - // Split into N LoadV4 - unsigned Ofst = 0; - unsigned VecSize = 4; - unsigned Opc = NVPTXISD::LoadParamV4; - if (EltVT.getSizeInBits() == 64) { - VecSize = 2; - Opc = NVPTXISD::LoadParamV2; - } - EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); - for (unsigned i = 0; i < NumElts; i += VecSize) { - SmallVector LoadRetVTs; - if (EltVT == MVT::i1 || EltVT == MVT::i8) { - // If loading i1/i8 result, generate - // load.b8 i16 - // if i1 - // trunc i16 to i1 - for (unsigned j = 0; j < VecSize; ++j) - LoadRetVTs.push_back(MVT::i16); - } else { - for (unsigned j = 0; j < VecSize; ++j) - LoadRetVTs.push_back(EltVT); - } - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); - 
SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Ofst, dl, MVT::i32), InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - Opc, dl, DAG.getVTList(LoadRetVTs), - LoadRetOps, EltVT, MachinePointerInfo()); - if (VecSize == 2) { - Chain = retval.getValue(2); - InFlag = retval.getValue(3); - } else { - Chain = retval.getValue(4); - InFlag = retval.getValue(5); - } + unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + SmallVector VectorInfo; + VectorizePTXValueVTs(VTs, Offsets, RetAlign, VectorInfo); - for (unsigned j = 0; j < VecSize; ++j) { - if (i + j >= NumElts) - break; - SDValue Elt = retval.getValue(j); - if (needTruncate) - Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); - InVals.push_back(Elt); - } - Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - } - } - } else { - SmallVector VTs; - SmallVector Offsets; - auto &DL = DAG.getDataLayout(); - ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0); - assert(VTs.size() == Ins.size() && "Bad value decomposition"); - unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL); - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - unsigned sz = VTs[i].getSizeInBits(); - unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]); - bool needTruncate = false; - if (VTs[i].isInteger() && sz < 8) { - sz = 8; + SmallVector LoadVTs; + int VecIdx = -1; // Index of the first element of the vector. + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + bool needTruncate = false; + EVT TheLoadType = VTs[i]; + EVT EltType = Ins[i].VT; + unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); + if (ExtendIntegerRetVal) { + TheLoadType = MVT::i32; + EltType = MVT::i32; + needTruncate = true; + } else if (TheLoadType.getSizeInBits() < 16) { + if (VTs[i].isInteger()) needTruncate = true; + EltType = MVT::i16; + } + + // Record index of the very first element of the vector. + if (VectorInfo[i] & PTX_LDST_BEGIN) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + VecIdx = i; + } + + LoadVTs.push_back(EltType); + + if (VectorInfo[i] & PTX_LDST_END) { + unsigned NumElts = LoadVTs.size(); + LoadVTs.push_back(MVT::Other); + LoadVTs.push_back(MVT::Glue); + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::LoadParam; + break; + case 2: + Op = NVPTXISD::LoadParamV2; + break; + case 4: + Op = NVPTXISD::LoadParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); } - SmallVector LoadRetVTs; - EVT TheLoadType = VTs[i]; - if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) { - // This is for integer types only, and specifically not for - // aggregates. - LoadRetVTs.push_back(MVT::i32); - TheLoadType = MVT::i32; - needTruncate = true; - } else if (sz < 16) { - // If loading i1/i8 result, generate - // load i8 (-> i16) - // trunc i16 to i1/i8 + SDValue VectorOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), + InFlag}; + SDValue RetVal = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(LoadVTs), VectorOps, TheLoadType, + MachinePointerInfo(), EltAlign); - // FIXME: Do we need to set needTruncate to true here, too? 
We could - // not figure out what this branch is for in D17872, so we left it - // alone. The comment above about loading i1/i8 may be wrong, as the - // branch above seems to cover integers of size < 32. - LoadRetVTs.push_back(MVT::i16); - } else - LoadRetVTs.push_back(Ins[i].VT); - LoadRetVTs.push_back(MVT::Other); - LoadRetVTs.push_back(MVT::Glue); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Ret = RetVal.getValue(j); + if (needTruncate) + Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); + InVals.push_back(Ret); + } + Chain = RetVal.getValue(NumElts); + InFlag = RetVal.getValue(NumElts + 1); - SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offsets[i], dl, MVT::i32), - InFlag}; - SDValue retval = DAG.getMemIntrinsicNode( - NVPTXISD::LoadParam, dl, - DAG.getVTList(LoadRetVTs), LoadRetOps, - TheLoadType, MachinePointerInfo(), AlignI); - Chain = retval.getValue(1); - InFlag = retval.getValue(2); - SDValue Ret0 = retval.getValue(0); - if (needTruncate) - Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); - InVals.push_back(Ret0); + // Cleanup + VecIdx = -1; + LoadVTs.clear(); } } } @@ -2371,176 +2264,70 @@ // appear in the same order as their order of appearance // in the original function. "idx+1" holds that order. if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) { - if (Ty->isAggregateType()) { - SmallVector vtparts; - SmallVector offsets; + SmallVector VTs; + SmallVector Offsets; + SmallVector VectorInfo; - // NOTE: Here, we lose the ability to issue vector loads for vectors - // that are a part of a struct. This should be investigated in the - // future. - ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets, - 0); - assert(vtparts.size() > 0 && "empty aggregate type not expected"); - bool aggregateIsPacked = false; - if (StructType *STy = dyn_cast(Ty)) - aggregateIsPacked = STy->isPacked(); + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); + assert(VTs.size() > 0 && "empty aggregate type not expected"); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - for (unsigned parti = 0, parte = vtparts.size(); parti != parte; - ++parti) { - EVT partVT = vtparts[parti]; - Value *srcValue = Constant::getNullValue( - PointerType::get(partVT.getTypeForEVT(F->getContext()), - ADDRESS_SPACE_PARAM)); - SDValue srcAddr = + bool aggregateIsPacked = false; + if (StructType *STy = dyn_cast(Ty)) + aggregateIsPacked = STy->isPacked(); + + VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty), + VectorInfo); + + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + int VecIdx = -1; // Index of the first element of the current vector. + for (unsigned parti = 0, parte = VTs.size(); parti != parte; + ++parti) { + if (VectorInfo[parti] & PTX_LDST_BEGIN) { + assert(VecIdx == -1 && "Orphaned vector."); + VecIdx = parti; + } + + // That's the last element of this store op. + if (VectorInfo[parti] & PTX_LDST_END) { + unsigned NumElts = parti - VecIdx + 1; + EVT EltVT = VTs[parti]; + // i1 is loaded/stored as i8. + EVT LoadVT = EltVT == MVT::i1 ? MVT::i8 : EltVT; + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); + SDValue VecAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, - DAG.getConstant(offsets[parti], dl, PtrVT)); - unsigned partAlign = aggregateIsPacked - ? 1 - : DL.getABITypeAlignment( - partVT.getTypeForEVT(F->getContext())); - SDValue p; - if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) { - ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
- ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr, - MachinePointerInfo(srcValue), partVT, partAlign); - } else { - p = DAG.getLoad(partVT, dl, Root, srcAddr, - MachinePointerInfo(srcValue), partAlign); - } - if (p.getNode()) - p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); - ++InsIdx; - } - if (vtparts.size() > 0) - --InsIdx; - continue; - } - if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(DL, Ty); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - unsigned NumElts = ObjectVT.getVectorNumElements(); - assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && - "Vector was not scalarized"); - EVT EltVT = ObjectVT.getVectorElementType(); - - // V1 load - // f32 = load ... - if (NumElts == 1) { - // We only have one element, so just directly load it - Value *SrcValue = Constant::getNullValue(PointerType::get( + DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); + Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue P = DAG.getLoad( - EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + SDValue P = + DAG.getLoad(VecVT, dl, Root, VecAddr, + MachinePointerInfo(srcValue), aggregateIsPacked, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); if (P.getNode()) P.getNode()->setIROrder(idx + 1); - - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) - P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P); - InVals.push_back(P); - ++InsIdx; - } else if (NumElts == 2) { - // V2 load - // f32,f32 = load ... - EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); - Value *SrcValue = Constant::getNullValue(PointerType::get( - VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue P = DAG.getLoad( - VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - - SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(0, dl)); - SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(1, dl)); - - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) { - Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0); - Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1); - } - - InVals.push_back(Elt0); - InVals.push_back(Elt1); - InsIdx += 2; - } else { - // V4 loads - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and - // the vector will be expanded to a power of 2 elements, so we know we - // can always round up to the next multiple of 4 when creating the - // vector loads. - // e.g. 
4 elem => 1 ld.v4 - // 6 elem => 2 ld.v4 - // 8 elem => 2 ld.v4 - // 11 elem => 3 ld.v4 - unsigned VecSize = 4; - if (EltVT.getSizeInBits() == 64) { - VecSize = 2; - } - EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); - unsigned Ofst = 0; - for (unsigned i = 0; i < NumElts; i += VecSize) { - Value *SrcValue = Constant::getNullValue( - PointerType::get(VecVT.getTypeForEVT(F->getContext()), - ADDRESS_SPACE_PARAM)); - SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, - DAG.getConstant(Ofst, dl, PtrVT)); - SDValue P = DAG.getLoad( - VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), - DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - - for (unsigned j = 0; j < VecSize; ++j) { - if (i + j >= NumElts) - break; - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, - DAG.getIntPtrConstant(j, dl)); - if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) - Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt); - InVals.push_back(Elt); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, + DAG.getIntPtrConstant(j, dl)); + // We've loaded i1 as an i8 and now must truncate it back to i1 + if (EltVT == MVT::i1) + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); + // Extend the element if necesary (e.g an i8 is loaded + // into an i16 register) + if (Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { + unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); } - Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + InVals.push_back(Elt); } - InsIdx += NumElts; + // Reset vector tracking state. + VecIdx = -1; } - - if (NumElts > 0) - --InsIdx; - continue; + ++InsIdx; } - // A plain scalar. - EVT ObjectVT = getValueType(DL, Ty); - // If ABI, load from the param symbol - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - Value *srcValue = Constant::getNullValue(PointerType::get( - ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue p; - if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { - ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? - ISD::SEXTLOAD : ISD::ZEXTLOAD; - p = DAG.getExtLoad( - ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue), - ObjectVT, - DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); - } else { - p = DAG.getLoad( - Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), - DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); - } - if (p.getNode()) - p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); + if (VTs.size() > 0) + --InsIdx; continue; } @@ -2582,165 +2369,83 @@ const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const Function *F = MF.getFunction(); - Type *RetTy = F->getReturnType(); - const DataLayout &TD = DAG.getDataLayout(); + Type *RetTy = MF.getFunction()->getReturnType(); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) return Chain; - if (VectorType *VTy = dyn_cast(RetTy)) { - // If we have a vector type, the OutVals array will be the scalarized - // components and we have combine them into 1 or more vector stores. 
- unsigned NumElts = VTy->getNumElements(); - assert(NumElts == Outs.size() && "Bad scalarization of return value"); + const DataLayout DL = DAG.getDataLayout(); + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); - // const_cast can be removed in later LLVM versions - EVT EltVT = getValueType(TD, RetTy).getVectorElementType(); - bool NeedExtend = false; - if (EltVT.getSizeInBits() < 16) - NeedExtend = true; + SmallVector VectorInfo; + VectorizePTXValueVTs(VTs, Offsets, + RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1, + VectorInfo); - // V1 store - if (NumElts == 1) { - SDValue StoreVal = OutVals[0]; - // We only have one element, so just directly store it - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); - SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - } else if (NumElts == 2) { - // V2 store - SDValue StoreVal0 = OutVals[0]; - SDValue StoreVal1 = OutVals[1]; + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + bool aggregateIsPacked = false; - if (NeedExtend) { - StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0); - StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1); - } - - SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0, - StoreVal1 }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, - DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - } else { - // V4 stores - // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the - // vector will be expanded to a power of 2 elements, so we know we can - // always round up to the next multiple of 4 when creating the vector - // stores. - // e.g. 4 elem => 1 st.v4 - // 6 elem => 2 st.v4 - // 8 elem => 2 st.v4 - // 11 elem => 3 st.v4 - - unsigned VecSize = 4; - if (OutVals[0].getValueSizeInBits() == 64) - VecSize = 2; - - unsigned Offset = 0; - - EVT VecVT = - EVT::getVectorVT(F->getContext(), EltVT, VecSize); - unsigned PerStoreOffset = - TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - - for (unsigned i = 0; i < NumElts; i += VecSize) { - // Get values - SDValue StoreVal; - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32)); - unsigned Opc = NVPTXISD::StoreRetvalV2; - EVT ExtendedVT = (NeedExtend) ? 
MVT::i16 : OutVals[0].getValueType(); + if (StructType *STy = dyn_cast(RetTy)) + aggregateIsPacked = STy->isPacked(); - StoreVal = OutVals[i]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - Ops.push_back(StoreVal); - - if (i + 1 < NumElts) { - StoreVal = OutVals[i + 1]; - if (NeedExtend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - - if (VecSize == 4) { - Opc = NVPTXISD::StoreRetvalV4; - if (i + 2 < NumElts) { - StoreVal = OutVals[i + 2]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - - if (i + 3 < NumElts) { - StoreVal = OutVals[i + 3]; - if (NeedExtend) - StoreVal = - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); - } else { - StoreVal = DAG.getUNDEF(ExtendedVT); - } - Ops.push_back(StoreVal); - } + SmallVector LdStOps; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PTX_LDST_BEGIN) { + assert(LdStOps.empty() && "Orphaned operand list."); + LdStOps.push_back(Chain); + LdStOps.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } - // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); - Chain = - DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops, - EltVT, MachinePointerInfo()); - Offset += PerStoreOffset; - } + SDValue RetVal = OutVals[i]; + if (ExtendIntegerRetVal) { + RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, RetVal); + } else if (RetVal.getValueSizeInBits() < 16) { + // Use 16-bit registers for small load-stores as it's the + // smallest general purpose register size supported by NVPTX. + RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } - } else { - SmallVector ValVTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0); - assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - SDValue theVal = OutVals[i]; - EVT TheValType = theVal.getValueType(); - unsigned numElems = 1; - if (TheValType.isVector()) - numElems = TheValType.getVectorNumElements(); - for (unsigned j = 0, je = numElems; j != je; ++j) { - SDValue TmpVal = theVal; - if (TheValType.isVector()) - TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - TheValType.getVectorElementType(), TmpVal, - DAG.getIntPtrConstant(j, dl)); - EVT TheStoreType = ValVTs[i]; - if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) { - // The following zero-extension is for integer types only, and - // specifically not for aggregates. - TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal); - TheStoreType = MVT::i32; - } else if (RetTy->isHalfTy()) { - TheStoreType = MVT::f16; - } else if (TmpVal.getValueSizeInBits() < 16) - TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal); + // Record the value to return. + LdStOps.push_back(RetVal); - SDValue Ops[] = { - Chain, - DAG.getConstant(Offsets[i], dl, MVT::i32), - TmpVal }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), Ops, - TheStoreType, - MachinePointerInfo()); + // That's the last element of this store op. 
+ if (VectorInfo[i] & PTX_LDST_END) { + NVPTXISD::NodeType Op; + unsigned NumElts = LdStOps.size() - 2; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreRetval; + break; + case 2: + Op = NVPTXISD::StoreRetvalV2; + break; + case 4: + Op = NVPTXISD::StoreRetvalV4; + break; + default: + llvm_unreachable("Invalid vector info."); } + + // Adjust type of load/store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), + LdStOps, TheStoreType, + MachinePointerInfo(), 1); + // Cleanup vector state. + LdStOps.clear(); } } Index: test/CodeGen/NVPTX/aggregate-return.ll =================================================================== --- test/CodeGen/NVPTX/aggregate-return.ll +++ test/CodeGen/NVPTX/aggregate-return.ll @@ -1,21 +1,40 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s declare <2 x float> @barv(<2 x float> %input) +declare <3 x float> @barv3(<3 x float> %input) declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) -define void @foov(<2 x float> %input, <2 x float>* %output) { -; CHECK-LABEL: @foov +define void @test_v2f32(<2 x float> %input, <2 x float>* %output) { +; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; store <2 x float> %call, <2 x float>* %output, align 8 -; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]} +; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } -define void @fooa([2 x float] %input, [2 x float]* %output) { -; CHECK-LABEL: @fooa +define void @test_v3f32(<3 x float> %input, <3 x float>* %output) { +; CHECK-LABEL: @test_v3f32 +; + %call = tail call <3 x float> @barv3(<3 x float> %input) +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; +; Make sure we don't load more values than than we need to. +; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; + store <3 x float> %call, <3 x float>* %output, align 8 +; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8], +; -- This is suboptimal. We should do st.v2.f32 instead +; of combining 2xf32 info i64. 
+; CHECK-DAG: st.u64 [{{%rd[0-9]}}], +; CHECK: ret; + ret void +} + +define void @test_a2f32([2 x float] %input, [2 x float]* %output) { +; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0]; @@ -28,8 +47,8 @@ ; CHECK: ret } -define void @foos({float, float} %input, {float, float}* %output) { -; CHECK-LABEL: @foos +define void @test_s2f32({float, float} %input, {float, float}* %output) { +; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0]; Index: test/CodeGen/NVPTX/f16-instructions.ll =================================================================== --- test/CodeGen/NVPTX/f16-instructions.ll +++ test/CodeGen/NVPTX/f16-instructions.ll @@ -229,7 +229,7 @@ ; CHECK-LABEL: test_select( ; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; ; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; ; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; ; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; Index: test/CodeGen/NVPTX/ldparam-v4.ll =================================================================== --- test/CodeGen/NVPTX/ldparam-v4.ll +++ test/CodeGen/NVPTX/ldparam-v4.ll @@ -2,8 +2,11 @@ declare <4 x float> @bar() +; CHECK-LABEL: .func foo( define void @foo(<4 x float>* %ptr) { -; CHECK: ld.param.v4.f32 +; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0]; +; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0]; +; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} %val = tail call <4 x float> @bar() store <4 x float> %val, <4 x float>* %ptr ret void Index: test/CodeGen/NVPTX/lower-aggr-copies.ll =================================================================== --- test/CodeGen/NVPTX/lower-aggr-copies.ll +++ test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to @@ -27,9 +27,9 @@ ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.u8 %rs[[REG:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -45,9 +45,9 @@ ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]] ; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 { @@ -78,12 +78,13 @@ ; 
IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]] ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller( -; PTX: ld.param.u8 %rs[[REG:[0-9]+]] +; PTX: ld.param.u32 %r[[C:[0-9]+]] +; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]]; ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 { @@ -118,7 +119,7 @@ ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]] ; -- this is the backwards copying BB ; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]] -; PTX: add.s64 %rd[[N]], %rd[[N]], -1 +; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1 ; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] ; -- this is the forwards copying BB @@ -126,7 +127,7 @@ ; PTX: @%p[[NEQ0]] bra LBB[[EXIT]] ; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] -; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1 +; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1 ; -- exit block ; PTX: LBB[[EXIT]]: ; PTX-NEXT: st.param.b64 [func_retval0 Index: test/CodeGen/NVPTX/param-load-store.ll =================================================================== --- /dev/null +++ test/CodeGen/NVPTX/param-load-store.ll @@ -0,0 +1,813 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_half = type { half } +%s_i32 = type { i32 } +%s_float = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. +%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then us neg.s32 to convert it to 32-bit -1 if it's set. 
+; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i1, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i1> @test_v5i1(<5 x i1> %a) { + %r = tail 
call <5 x i1> @test_v5i1(<5 x i1> %a); + ret <5 x i1> %r; +} + +; Unsigned i8 is loaded directly into 32-bit register. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8( +; CHECK-NEXT: .param .b32 test_i8_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i8 @test_i8(i8 %a) { + %r = tail call i8 @test_i8(i8 %a); + ret i8 %r; +} + +; signed i8 is loaded into 16-bit register which is then sign-extended to i32. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8s( +; CHECK-NEXT: .param .b32 test_i8s_param_0 +; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; +; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8s, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? +; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i8 @test_i8s(i8 signext %a) { + %r = tail call signext i8 @test_i8s(i8 signext %a); + ret i8 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i8( +; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i8, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i8> @test_v3i8(<3 x i8> %a) { + %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); + ret <3 x i8> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i8( +; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i8, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i8> @test_v4i8(<4 x i8> %a) { + %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); + ret <4 x i8> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i8( +; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] +; CHECK-DAG: ld.param.u8 
[[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+ %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+ ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+ %r = tail call i16 @test_i16(i16 %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+ %r = tail call signext i16 @test_i16s(i16 signext %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+ %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+ ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+ %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+ ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> @test_v5i16(<5 x i16> %a) {
+ %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+ ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_half(
+; CHECK-NEXT: .param .b32 test_half_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_half,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_half(half %a) {
+ %r = tail call half @test_half(half %a);
+ ret half %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+ %r = tail call i32 @test_i32(i32 %a);
+ ret i32 %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_float(
+; CHECK-NEXT: .param .b32 test_float_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_float,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_float(float %a) {
+ %r = tail call float @test_float(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0;
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]],
[[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i64> @test_v3i64(<3 x i64> %a) { + %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); + ret <3 x i64> %r; +} + +; For i64 vector loads are limited by PTX to 2 elements. +; CHECK: .func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v4i64( +; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] +; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i64, +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-NEXT: ret; +define <4 x i64> @test_v4i64(<4 x i64> %a) { + %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); + ret <4 x i64> %r; +} + +; Aggregates, on the other hand, do not get extended. + +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i1( +; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i1, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i1 @test_s_i1(%s_i1 %a) { + %r = tail call %s_i1 @test_s_i1(%s_i1 %a); + ret %s_i1 %r; +} + +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i8( +; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i8, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i8 @test_s_i8(%s_i8 %a) { + %r = tail call %s_i8 @test_s_i8(%s_i8 %a); + ret %s_i8 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_i16( +; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] +; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i16, +; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i16 @test_s_i16(%s_i16 %a) { + %r = tail call %s_i16 @test_s_i16(%s_i16 %a); + ret %s_i16 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_half( +; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2] +; CHECK: ld.param.b16 
[[A:%h[0-9]+]], [test_s_half_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_half, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_half @test_s_half(%s_half %a) { + %r = tail call %s_half @test_s_half(%s_half %a); + ret %s_half %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_i32( +; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i32 @test_s_i32(%s_i32 %a) { + %r = tail call %s_i32 @test_s_i32(%s_i32 %a); + ret %s_i32 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_float( +; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4] +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_float, +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_float @test_s_float(%s_float %a) { + %r = tail call %s_float @test_s_float(%s_float %a); + ret %s_float %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_s_i64( +; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] +; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i64, +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i64 @test_s_i64(%s_i64 %a) { + %r = tail call %s_i64 @test_s_i64(%s_i64 %a); + ret %s_i64 %r; +} + +; Fields that have different types, but identical sizes are not vectorized. 
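+; The i32 and float fields of the struct alternate here, so each field is
+; loaded and stored with its own scalar access in the checks below.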
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+ %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+ ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+;
CHECK: st.param.b32 [param0+12], [[E3]]; +; CHECK: st.param.b32 [param0+16], [[E4]]; +; CHECK: st.param.b64 [param0+24], [[E5]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_s_i1i32x4, +; CHECK: ( +; CHECK: param0 +; CHECK: ); +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; +; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; +; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; +; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; +; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; +; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; +; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; +; CHECK: ret; + +define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { + %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); + ret %s_i8i32x4 %r; +} + +; -- All loads/stores from parameters aligned by one must be done one +; -- byte at a time. +; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) +; CHECK-LABEL: test_s_i1i32x4p( +; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; +; --- TODO +; --- Unaligned parameter store/ return value load is broken in both nvcc +; --- and llvm and needs to be fixed. 
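+; --- The checks below document the current output: each field is still copied
+; --- with a full-width b32/b64 access even though its offset is only 1-aligned.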
+; CHECK: .param .align 1 .b8 param0[25]; +; CHECK-DAG: st.param.b32 [param0+0], +; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+8], +; CHECK-DAG: st.param.b32 [param0+9], +; CHECK-DAG: st.param.b32 [param0+13], +; CHECK-DAG: st.param.b64 [param0+17], +; CHECK: .param .align 1 .b8 retval0[25]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4p, +; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; +; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; +; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; +; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; +; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; +; CHECK-DAG: st.param.b32 [func_retval0+0], +; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK-DAG: st.param.b32 [func_retval0+9], +; CHECK-DAG: st.param.b32 [func_retval0+13], +; CHECK-DAG: st.param.b64 [func_retval0+17], + +define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { + %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); + ret %s_i8i32x4p %r; +} + +; Check that we can vectorize loads that span multiple aggregate fields. +; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + 
ret %s_crossfield %r; +} Index: test/CodeGen/NVPTX/vec-param-load.ll =================================================================== --- test/CodeGen/NVPTX/vec-param-load.ll +++ test/CodeGen/NVPTX/vec-param-load.ll @@ -2,12 +2,81 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -define <16 x float> @foo(<16 x float> %a) { -; Make sure we index into vectors properly -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0]; +define <16 x float> @test_v16f32(<16 x float> %a) { +; CHECK-LABEL: test_v16f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; +; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]} +; CHECK: ret; ret <16 x float> %a } + +define <8 x float> @test_v8f32(<8 x float> %a) { +; CHECK-LABEL: test_v8f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK: ret; + ret <8 x float> %a +} + +define <4 x float> @test_v4f32(<4 x float> %a) { +; CHECK-LABEL: test_v4f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <4 x float> %a +} + +define <2 x float> @test_v2f32(<2 x float> %a) { +; CHECK-LABEL: test_v2f32( +; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <2 x float> %a +} + +; Oddly shaped vectors should not load any extra elements. 
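+; <3 x float> is accessed as a v2.f32 plus a scalar f32 rather than being
+; widened to a v4.f32 access.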
+define <3 x float> @test_v3f32(<3 x float> %a) { +; CHECK-LABEL: test_v3f32( +; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; +; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]] +; CHECK: ret; + ret <3 x float> %a +} + +define <8 x i64> @test_v8i64(<8 x i64> %a) { +; CHECK-LABEL: test_v8i64( +; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; +; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; +; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} +; CHECK: ret; + ret <8 x i64> %a +} + +define <16 x i16> @test_v16i16(<16 x i16> %a) { +; CHECK-LABEL: test_v16i16( +; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24]; +; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; +; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8]; +; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]} +; CHECK: ret; + ret <16 x i16> %a +} Index: test/CodeGen/NVPTX/vec8.ll =================================================================== --- test/CodeGen/NVPTX/vec8.ll +++ test/CodeGen/NVPTX/vec8.ll @@ -4,10 +4,15 @@ ; CHECK: .visible .func foo define void @foo(<8 x i8> %a, i8* %b) { - %t0 = extractelement <8 x i8> %a, i32 0 -; CHECK-DAG: ld.param.v4.u8 -; CHECK-DAG: ld.param.u32 - store i8 %t0, i8* %b +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] +; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] +; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] +; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK: st.u8 [%[[B]]], [[T]]; + %t0 = extractelement <8 x i8> %a, i32 1 + %t1 = extractelement <8 x i8> %a, i32 6 + %t = add i8 %t0, %t1 + store i8 %t, i8* %b ret void } Index: test/CodeGen/NVPTX/vector-call.ll =================================================================== --- test/CodeGen/NVPTX/vector-call.ll +++ test/CodeGen/NVPTX/vector-call.ll @@ -4,9 +4,27 @@ declare void @bar(<4 x i32>) -; CHECK-LABEL: @foo +; CHECK-LABEL: .func foo( +; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: call.uni +; CHECK: ret; define void @foo(<4 x i32> %a) { -; CHECK: st.param.v4.b32 tail call void @bar(<4 x i32> %a) ret void } + +; CHECK-LABEL: .func foo3( +; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; +; CHECK: .param .align 16 .b8 
param0[16]; +; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK: call.uni +; CHECK: ret; +declare void @bar3(<3 x i32>) +define void @foo3(<3 x i32> %a) { + tail call void @bar3(<3 x i32> %a) + ret void +}