Index: llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -138,6 +138,17 @@
   Visiting.erase(GV);
 }
 
+// The PTX ABI requires all scalar argument/return values to have a
+// bit-size that is a power of two and at least 32 bits.
+static unsigned promoteScalarArgumentSize(unsigned size) {
+  if (size <= 32)
+    return 32;
+  else if (size <= 64)
+    return 64;
+  else
+    return size;
+}
+
 void NVPTXAsmPrinter::emitInstruction(const MachineInstr *MI) {
   MCInst Inst;
   lowerToMCInst(MI, Inst);
@@ -351,8 +362,7 @@
     // PTX ABI requires all scalar return values to be at least 32
     // bits in size. fp16 normally uses .b16 as its storage type in
     // PTX, so its size must be adjusted here, too.
-    if (size < 32)
-      size = 32;
+    size = promoteScalarArgumentSize(size);
 
     O << ".param .b" << size << " func_retval0";
   } else if (isa<PointerType>(Ty)) {
@@ -381,8 +391,8 @@
 
     for (unsigned j = 0, je = elems; j != je; ++j) {
       unsigned sz = elemtype.getSizeInBits();
-      if (elemtype.isInteger() && (sz < 32))
-        sz = 32;
+      if (elemtype.isInteger())
+        sz = promoteScalarArgumentSize(sz);
       O << ".reg .b" << sz << " func_retval" << idx;
       if (j < je - 1)
         O << ", ";
@@ -1491,8 +1501,7 @@
       unsigned sz = 0;
       if (isa<IntegerType>(Ty)) {
         sz = cast<IntegerType>(Ty)->getBitWidth();
-        if (sz < 32)
-          sz = 32;
+        sz = promoteScalarArgumentSize(sz);
       } else if (isa<PointerType>(Ty))
         sz = thePointerTy.getSizeInBits();
       else if (Ty->isHalfTy())
@@ -1556,8 +1565,8 @@
 
       for (unsigned j = 0, je = elems; j != je; ++j) {
         unsigned sz = elemtype.getSizeInBits();
-        if (elemtype.isInteger() && (sz < 32))
-          sz = 32;
+        if (elemtype.isInteger())
+          sz = promoteScalarArgumentSize(sz);
         O << "\t.reg .b" << sz << " ";
         printParamName(I, paramIndex, O);
         if (j < je - 1)
Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -206,6 +206,39 @@
   }
 }
 
+/// PromoteScalarIntegerPTX
+/// Used to make sure the arguments/returns are suitable for passing
+/// and promote them to a larger size if they're not.
+///
+/// The promoted type is placed in \p PromotedVT if the function returns true.
+static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
+  if (VT.isScalarInteger()) {
+    auto n = VT.getFixedSizeInBits();
+    if (n >= 8 && isPowerOf2_32(n))
+      return false;
+    switch (PowerOf2Ceil(n)) {
+    default:
+      return false; // Covers i1 and integers larger than i64.
+    case 2:
+    case 4:
+    case 8:
+      *PromotedVT = MVT::i8;
+      break;
+    case 16:
+      *PromotedVT = MVT::i16;
+      break;
+    case 32:
+      *PromotedVT = MVT::i32;
+      break;
+    case 64:
+      *PromotedVT = MVT::i64;
+      break;
+    }
+    return true;
+  }
+  return false;
+}
+
 // Check whether we can merge loads/stores of some of the pieces of a
 // flattened function parameter or return value into a single vector
 // load/store.
@@ -1293,6 +1326,8 @@
       // PTX, so its size must be adjusted here, too.
       if (size < 32)
         size = 32;
+      else if (size > 32 && size < 64)
+        size = 64;
 
       O << ".param .b" << size << " _";
     } else if (isa<PointerType>(retTy)) {
@@ -1345,6 +1380,8 @@
         sz = cast<IntegerType>(Ty)->getBitWidth();
         if (sz < 32)
           sz = 32;
+        else if (sz > 32 && sz < 64)
+          sz = 64;
       } else if (isa<PointerType>(Ty)) {
         sz = PtrVT.getSizeInBits();
       } else if (Ty->isHalfTy())
@@ -1520,6 +1557,8 @@
         // size. FP16 is loaded/stored using i16, so it's handled
         // here as well.
         TypeSize = 4;
+      } else if (VT.isInteger() && TypeSize > 4 && TypeSize < 8) {
+        TypeSize = 8;
       }
       SDValue DeclareScalarParamOps[] = {
           Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
@@ -1556,6 +1595,15 @@
       }
       SDValue StVal = OutVals[OIdx];
+
+      MVT PromotedVT;
+      if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+        llvm::ISD::NodeType Ext =
+            Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+        StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
+        EltVT = EVT(PromotedVT);
+      }
+
       if (IsByVal) {
         auto PtrVT = getPointerTy(DL);
         SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
@@ -1641,6 +1689,8 @@
     // Scalar needs to be at least 32bit wide
     if (resultsz < 32)
       resultsz = 32;
+    else if (resultsz > 32 && resultsz < 64)
+      resultsz = 64;
     SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                                 DAG.getConstant(resultsz, dl, MVT::i32),
@@ -1778,6 +1828,14 @@
       EVT TheLoadType = VTs[i];
       EVT EltType = Ins[i].VT;
       Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
+      MVT PromotedVT;
+
+      if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
+        TheLoadType = EVT(PromotedVT);
+        EltType = EVT(PromotedVT);
+        needTruncate = true;
+      }
+
       if (ExtendIntegerRetVal) {
         TheLoadType = MVT::i32;
         EltType = MVT::i32;
@@ -2558,6 +2616,13 @@
       // v2f16 was loaded as an i32. Now we must bitcast it back.
      else if (EltVT == MVT::v2f16)
        Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+
+      // If a promoted integer type is used, truncate down to the original.
+      MVT PromotedVT;
+      if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+        Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+      }
+
      // Extend the element if necessary (e.g. an i8 is loaded
      // into an i16 register)
      if (Ins[InsIdx].VT.isInteger() &&
@@ -2627,11 +2692,24 @@
     return Chain;
 
   const DataLayout &DL = DAG.getDataLayout();
+  SmallVector<SDValue, 16> PromotedOutVals;
   SmallVector<EVT, 16> VTs;
   SmallVector<uint64_t, 16> Offsets;
   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
 
+  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+    SDValue PromotedOutVal = OutVals[i];
+    MVT PromotedVT;
+    if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
+      llvm::ISD::NodeType Ext =
+          Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
+      VTs[i] = EVT(PromotedVT);
+    }
+    PromotedOutVals.push_back(PromotedOutVal);
+  }
+
   auto VectorInfo = VectorizePTXValueVTs(
       VTs, Offsets,
       RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
@@ -2652,12 +2730,14 @@
       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
     }
 
-    SDValue RetVal = OutVals[i];
+    SDValue OutVal = OutVals[i];
+    SDValue RetVal = PromotedOutVals[i];
+
     if (ExtendIntegerRetVal) {
       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                   : ISD::ZERO_EXTEND,
                            dl, MVT::i32, RetVal);
-    } else if (RetVal.getValueSizeInBits() < 16) {
+    } else if (OutVal.getValueSizeInBits() < 16) {
       // Use 16-bit registers for small load-stores as it's the
       // smallest general purpose register size supported by NVPTX.
       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
Index: llvm/test/CodeGen/NVPTX/non-pow-two-param.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/non-pow-two-param.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK: .visible .func (.param .b32 func_retval0) callee_i19
+; CHECK: .param .b32 callee_i19_param_0
+define i19 @callee_i19(i19 %a) {
+  %val = alloca i19, align 4
+  store i19 %a, i19* %val, align 4
+  %ret = load i19, i19* %val, align 1
+; CHECK: ld.param.u8
+; CHECK: ld.param.u16
+; CHECK: st.param.b32
+  ret i19 %ret
+}
+
+; CHECK: .visible .func caller_i19
+define void @caller_i19(i19* %a) {
+  %val = load i19, i19* %a
+  %ret = call i19 @callee_i19(i19 %val)
+; CHECK: ld.param.b32
+; CHECK: st.u16
+; CHECK: st.u8
+  store i19 %ret, i19* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b32 func_retval0) callee_i24
+; CHECK: .param .b32 callee_i24_param_0
+define i24 @callee_i24(i24 %a) {
+  %val = alloca i24, align 4
+  store i24 %a, i24* %val, align 4
+  %ret = load i24, i24* %val, align 1
+; CHECK: ld.param.u8
+; CHECK: ld.param.u16
+; CHECK: st.param.b32
+  ret i24 %ret
+}
+
+; CHECK: .visible .func caller_i24
+define void @caller_i24(i24* %a) {
+  %val = load i24, i24* %a
+  %ret = call i24 @callee_i24(i24 %val)
+; CHECK: ld.param.b32
+; CHECK: st.u16
+; CHECK: st.u8
+  store i24 %ret, i24* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b32 func_retval0) callee_i29
+; CHECK: .param .b32 callee_i29_param_0
+define i29 @callee_i29(i29 %a) {
+  %val = alloca i29, align 4
+  store i29 %a, i29* %val, align 4
+  %ret = load i29, i29* %val, align 1
+; CHECK: ld.param.u32
+; CHECK: st.param.b32
+  ret i29 %ret
+}
+
+; CHECK: .visible .func caller_i29
+define void @caller_i29(i29* %a) {
+  %val = load i29, i29* %a
+  %ret = call i29 @callee_i29(i29 %val)
+; CHECK: ld.param.b32
+; CHECK: and.b32 {{%r[0-9]+}}, {{%r[0-9]+}}, 536870911;
+; CHECK: st.u32
+  store i29 %ret, i29* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b64 func_retval0) callee_i40
+; CHECK: .param .b64 callee_i40_param_0
+define i40 @callee_i40(i40 %a) {
+  %val = alloca i40, align 8
+  store i40 %a, i40* %val, align 8
+  %ret = load i40, i40* %val, align 1
+; CHECK: ld.param.u8
+; CHECK: ld.param.u32
+; CHECK: st.param.b64
+  ret i40 %ret
+}
+
+; CHECK: .visible .func caller_i40
+define void @caller_i40(i40* %a) {
+  %val = load i40, i40* %a
+  %ret = call i40 @callee_i40(i40 %val)
+; CHECK: ld.param.b64
+; CHECK: st.u32
+; CHECK: st.u8
+  store i40 %ret, i40* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b64 func_retval0) callee_i48
+; CHECK: .param .b64 callee_i48_param_0
+define i48 @callee_i48(i48 %a) {
+  %val = alloca i48, align 8
+  store i48 %a, i48* %val, align 8
+  %ret = load i48, i48* %val, align 1
+; CHECK: ld.param.u16
+; CHECK: ld.param.u32
+; CHECK: st.param.b64
+  ret i48 %ret
+}
+
+; CHECK: .visible .func caller_i48
+define void @caller_i48(i48* %a) {
+  %val = load i48, i48* %a
+  %ret = call i48 @callee_i48(i48 %val)
+; CHECK: ld.param.b64
+; CHECK: st.u32
+; CHECK: st.u16
+  store i48 %ret, i48* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b64 func_retval0) callee_i51
+; CHECK: .param .b64 callee_i51_param_0
+define i51 @callee_i51(i51 %a) {
+  %val = alloca i51, align 8
+  store i51 %a, i51* %val, align 8
+  %ret = load i51, i51* %val, align 1
+; CHECK: ld.param.u8
+; CHECK: shl.b64
+; CHECK: ld.param.u16
+; CHECK: or.b64
+; CHECK: shl.b64
+; CHECK: ld.param.u32
+; CHECK: or.b64
+; CHECK: add.u32
+; CHECK: st.local.u16
+; CHECK: st.local.u32
+; CHECK: bfe.u64
+; CHECK: st.local.u8
+; CHECK: st.param.b64 [func_retval0+0]
+  ret i51 %ret
+}
+
+; CHECK: .visible .func caller_i51
+define void @caller_i51(i51* %a) {
+  %val = load i51, i51* %a
+  %ret = call i51 @callee_i51(i51 %val)
+; CHECK: ld.param.b64
+; CHECK: st.u32
+; CHECK: bfe.u64
+; CHECK: st.u8
+; CHECK: shr.u64
+; CHECK: st.u16
+  store i51 %ret, i51* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b64 func_retval0) callee_i56
+; CHECK: .param .b64 callee_i56_param_0
+define i56 @callee_i56(i56 %a) {
+  %val = alloca i56, align 8
+  store i56 %a, i56* %val, align 8
+  %ret = load i56, i56* %val, align 1
+; CHECK: ld.param.u8
+; CHECK: ld.param.u16
+; CHECK: ld.param.u32
+; CHECK: st.param.b64
+  ret i56 %ret
+}
+
+; CHECK: .visible .func caller_i56
+define void @caller_i56(i56* %a) {
+  %val = load i56, i56* %a
+  %ret = call i56 @callee_i56(i56 %val)
+; CHECK: ld.param.b64
+; CHECK: st.u32
+; CHECK: st.u8
+; CHECK: st.u16
+  store i56 %ret, i56* %a
+  ret void
+}
+
+
+; CHECK: .visible .func (.param .b64 func_retval0) callee_i57
+; CHECK: .param .b64 callee_i57_param_0
+define i57 @callee_i57(i57 %a) {
+  %val = alloca i57, align 8
+  store i57 %a, i57* %val, align 8
+  %ret = load i57, i57* %val, align 1
+; CHECK: ld.param.u64
+; CHECK: st.param.b64
+  ret i57 %ret
+}
+
+; CHECK: .visible .func caller_i57
+define void @caller_i57(i57* %a) {
+  %val = load i57, i57* %a
+  %ret = call i57 @callee_i57(i57 %val)
+; CHECK: ld.param.b64
+; CHECK: and.b64 {{%rd[0-9]+}}, {{%rd[0-9]+}}, 144115188075855871;
+; CHECK: st.u64
+  store i57 %ret, i57* %a
+  ret void
+}
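
Note (not part of the patch): the patch applies two distinct rounding rules. PromoteScalarIntegerPTX picks the next power-of-two register type for an odd-width scalar (minimum i8; i1 and widths above 64 bits are left alone), while promoteScalarArgumentSize picks the bit-size used in the .param/.reg declarations (minimum 32 bits). The following is a minimal standalone C++ sketch of those two rules; the helper names promotedRegisterBits and promotedParamBits are illustrative and do not appear in the patch.

// Standalone sketch of the two promotion rules, using plain unsigned
// bit-widths instead of LLVM's MVT/EVT types.
#include <cstdio>

// Round a scalar bit-width up to the next power of two, mirroring
// PromoteScalarIntegerPTX (i8 minimum; i1 and >64-bit widths untouched).
static unsigned promotedRegisterBits(unsigned bits) {
  if (bits <= 1 || bits > 64)
    return bits; // i1 and wider-than-i64 integers are handled elsewhere.
  unsigned p = 8; // smallest promoted register type is i8
  while (p < bits)
    p *= 2;
  return p;
}

// Round a scalar bit-width up to the declared .param/.reg size,
// mirroring promoteScalarArgumentSize (32-bit minimum).
static unsigned promotedParamBits(unsigned bits) {
  if (bits <= 32)
    return 32;
  if (bits <= 64)
    return 64;
  return bits;
}

int main() {
  // The widths exercised by non-pow-two-param.ll, plus a sub-byte case.
  const unsigned widths[] = {7, 19, 24, 29, 40, 48, 51, 56, 57};
  for (unsigned w : widths)
    std::printf("i%-2u -> register i%u, declared .b%u\n", w,
                promotedRegisterBits(w), promotedParamBits(w));
  return 0;
}

Running the sketch reproduces the pairs checked by the test, e.g. i19/i24/i29 are declared as .b32 and i40 through i57 as .b64.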