Index: llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,4661 +1,4662 @@
-//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that NVPTX uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "NVPTX.h"
-#include "NVPTXISelLowering.h"
-#include "NVPTXSection.h"
-#include "NVPTXSubtarget.h"
-#include "NVPTXTargetMachine.h"
-#include "NVPTXTargetObjectFile.h"
-#include "NVPTXUtilities.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "nvptx-lower"
-
-using namespace llvm;
-
-static unsigned int uniqueCallSite = 0;
-
-static cl::opt<bool> sched4reg(
-    "nvptx-sched4reg",
-    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
-
-static cl::opt<int>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
-                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
-                             " 1: do it, 2: do it aggressively)"),
-                    cl::init(2));
-
-static cl::opt<int> UsePrecDivF32(
-    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
-    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
-             " IEEE Compliant F32 div.rnd if available."),
-    cl::init(2));
-
-static cl::opt<bool> UsePrecSqrtF32(
-    "nvptx-prec-sqrtf32", cl::Hidden,
-    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
-    cl::init(true));
-
-static cl::opt<bool> FtzEnabled(
-    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
-    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
-    cl::init(false));
-
-int NVPTXTargetLowering::getDivF32Level() const {
-  if (UsePrecDivF32.getNumOccurrences() > 0) {
-    // If nvptx-prec-divf32=N is used on the command-line, always honor it
-    return UsePrecDivF32;
-  } else {
-    // Otherwise, use div.approx if fast math is enabled
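    // Illustrative example (not normative): with the defaults above, a build
    // that enables unsafe-fp-math gets level 0 and f32 division lowers to
    // div.approx, while a regular build gets level 2 and uses the
    // IEEE-compliant rounded divide described in the option text.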
-    if (getTargetMachine().Options.UnsafeFPMath)
-      return 0;
-    else
-      return 2;
-  }
-}
-
-bool NVPTXTargetLowering::usePrecSqrtF32() const {
-  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
-    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
-    return UsePrecSqrtF32;
-  } else {
-    // Otherwise, use sqrt.approx if fast math is enabled
-    return !getTargetMachine().Options.UnsafeFPMath;
-  }
-}
-
-bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
-  // TODO: Get rid of this flag; there can be only one way to do this.
-  if (FtzEnabled.getNumOccurrences() > 0) {
-    // If nvptx-f32ftz is used on the command-line, always honor it
-    return FtzEnabled;
-  } else {
-    const Function *F = MF.getFunction();
-    // Otherwise, check for an nvptx-f32ftz attribute on the function
-    if (F->hasFnAttribute("nvptx-f32ftz"))
-      return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
-    else
-      return false;
-  }
-}
-
-static bool IsPTXVectorType(MVT VT) {
-  switch (VT.SimpleTy) {
-  default:
-    return false;
-  case MVT::v2i1:
-  case MVT::v4i1:
-  case MVT::v2i8:
-  case MVT::v4i8:
-  case MVT::v2i16:
-  case MVT::v4i16:
-  case MVT::v2i32:
-  case MVT::v4i32:
-  case MVT::v2i64:
-  case MVT::v2f16:
-  case MVT::v4f16:
-  case MVT::v8f16: // <4 x f16x2>
-  case MVT::v2f32:
-  case MVT::v4f32:
-  case MVT::v2f64:
-    return true;
-  }
-}
-
-/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
-/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
-/// into their primitive components.
-/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
-/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
-/// LowerCall, and LowerReturn.
-static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
-                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
-                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
-                               uint64_t StartingOffset = 0) {
-  SmallVector<EVT, 16> TempVTs;
-  SmallVector<uint64_t, 16> TempOffsets;
-
-  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
-  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
-    EVT VT = TempVTs[i];
-    uint64_t Off = TempOffsets[i];
-    // Split vectors into individual elements, except for v2f16, which
-    // we will pass as a single scalar.
-    if (VT.isVector()) {
-      unsigned NumElts = VT.getVectorNumElements();
-      EVT EltVT = VT.getVectorElementType();
-      // Vectors with an even number of f16 elements will be passed to
-      // us as an array of v2f16 elements. We must match this so we
-      // stay in sync with Ins/Outs.
-      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
-        EltVT = MVT::v2f16;
-        NumElts /= 2;
-      }
-      for (unsigned j = 0; j != NumElts; ++j) {
-        ValueVTs.push_back(EltVT);
-        if (Offsets)
-          Offsets->push_back(Off + j * EltVT.getStoreSize());
-      }
-    } else {
-      ValueVTs.push_back(VT);
-      if (Offsets)
-        Offsets->push_back(Off);
-    }
-  }
-}
-
-// Check whether we can merge loads/stores of some of the pieces of a
-// flattened function parameter or return value into a single vector
-// load/store.
-//
-// The flattened parameter is represented as a list of EVTs and
-// offsets, and the whole structure is aligned to ParamAlignment. This
-// function determines whether we can load/store pieces of the
-// parameter starting at index Idx using a single vectorized op of
-// size AccessSize. If so, it returns the number of param pieces
-// covered by the vector op. Otherwise, it returns 1.
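// Worked example (illustrative values): a parameter flattened to four f32
// pieces at offsets {0, 4, 8, 12} with ParamAlignment 16 lets
// CanMergeParamLoadStoresStartingAt(0, /*AccessSize=*/16, ...) return 4, and
// VectorizePTXValueVTs below then marks the pieces
// {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} so a single v4 access is used.
// With ParamAlignment 8 the 16-byte access is rejected, the pieces merge in
// pairs instead, and the result is {PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST}.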
-static unsigned CanMergeParamLoadStoresStartingAt(
-    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
-    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
-  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
-
-  // Can't vectorize if param alignment is not sufficient.
-  if (AccessSize > ParamAlignment)
-    return 1;
-  // Can't vectorize if offset is not aligned.
-  if (Offsets[Idx] & (AccessSize - 1))
-    return 1;
-
-  EVT EltVT = ValueVTs[Idx];
-  unsigned EltSize = EltVT.getStoreSize();
-
-  // Element is too large to vectorize.
-  if (EltSize >= AccessSize)
-    return 1;
-
-  unsigned NumElts = AccessSize / EltSize;
-  // Can't vectorize if AccessSize is not a multiple of EltSize.
-  if (AccessSize != EltSize * NumElts)
-    return 1;
-
-  // We don't have enough elements to vectorize.
-  if (Idx + NumElts > ValueVTs.size())
-    return 1;
-
-  // PTX ISA can only deal with 2- and 4-element vector ops.
-  if (NumElts != 4 && NumElts != 2)
-    return 1;
-
-  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
-    // Types do not match.
-    if (ValueVTs[j] != EltVT)
-      return 1;
-
-    // Elements are not contiguous.
-    if (Offsets[j] - Offsets[j - 1] != EltSize)
-      return 1;
-  }
-  // OK. We can vectorize ValueVTs[i..i+NumElts)
-  return NumElts;
-}
-
-// Flags for tracking per-element vectorization state of loads/stores
-// of a flattened function parameter or return value.
-enum ParamVectorizationFlags {
-  PVF_INNER = 0x0,  // Middle elements of a vector.
-  PVF_FIRST = 0x1,  // First element of the vector.
-  PVF_LAST = 0x2,   // Last element of the vector.
-  // Scalar is effectively a 1-element vector.
-  PVF_SCALAR = PVF_FIRST | PVF_LAST
-};
-
-// Computes whether and how we can vectorize the loads/stores of a
-// flattened function parameter or return value.
-//
-// The flattened parameter is represented as the list of ValueVTs and
-// Offsets, and is aligned to ParamAlignment bytes. We return a vector
-// of the same size as ValueVTs indicating how each piece should be
-// loaded/stored (i.e. as a scalar, or as part of a vector
-// load/store).
-static SmallVector<ParamVectorizationFlags, 16>
-VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
-                     const SmallVectorImpl<uint64_t> &Offsets,
-                     unsigned ParamAlignment) {
-  // Set vector size to match ValueVTs and mark all elements as
-  // scalars by default.
-  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
-  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
-
-  // Check what we can vectorize using 128/64/32-bit accesses.
-  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
-    // Skip elements we've already processed.
-    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
-    for (unsigned AccessSize : {16, 8, 4, 2}) {
-      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
-          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
-      // Mark vectorized elements.
-      switch (NumElts) {
-      default:
-        llvm_unreachable("Unexpected return value");
-      case 1:
-        // Can't vectorize using this size, try next smaller size.
-        continue;
-      case 2:
-        assert(I + 1 < E && "Not enough elements.");
-        VectorInfo[I] = PVF_FIRST;
-        VectorInfo[I + 1] = PVF_LAST;
-        I += 1;
-        break;
-      case 4:
-        assert(I + 3 < E && "Not enough elements.");
-        VectorInfo[I] = PVF_FIRST;
-        VectorInfo[I + 1] = PVF_INNER;
-        VectorInfo[I + 2] = PVF_INNER;
-        VectorInfo[I + 3] = PVF_LAST;
-        I += 3;
-        break;
-      }
-      // Break out of the inner loop because we've already succeeded
-      // using largest possible AccessSize.
-      break;
-    }
-  }
-  return VectorInfo;
-}
-
-// NVPTXTargetLowering Constructor.
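// Standalone sketch (illustrative only; a simplified model of the merging rule
// in CanMergeParamLoadStoresStartingAt above, using plain byte sizes instead of
// EVTs). It prints how many f32 pieces of a {0, 4, 8, 12} layout a single
// access can cover at different parameter alignments: 4 at align 16, 2 at
// align 8, and 1 (scalar) at align 4.
#include <cstdint>
#include <cstdio>
#include <vector>

static unsigned canMergeAt(unsigned Idx, uint32_t AccessSize,
                           const std::vector<unsigned> &Sizes,
                           const std::vector<uint64_t> &Offsets,
                           unsigned ParamAlignment) {
  // Alignment and offset must both allow an access of this size.
  if (AccessSize > ParamAlignment || (Offsets[Idx] & (AccessSize - 1)))
    return 1;
  unsigned EltSize = Sizes[Idx];
  if (EltSize >= AccessSize || AccessSize % EltSize != 0)
    return 1;
  unsigned NumElts = AccessSize / EltSize;
  // Only 2- and 4-element vector ops, and only if enough pieces remain.
  if (Idx + NumElts > Sizes.size() || (NumElts != 2 && NumElts != 4))
    return 1;
  // Remaining pieces must match in size and be contiguous.
  for (unsigned J = Idx + 1; J < Idx + NumElts; ++J)
    if (Sizes[J] != EltSize || Offsets[J] - Offsets[J - 1] != EltSize)
      return 1;
  return NumElts;
}

int main() {
  std::vector<unsigned> Sizes{4, 4, 4, 4};   // four f32 pieces
  std::vector<uint64_t> Offsets{0, 4, 8, 12};
  for (unsigned Align : {16u, 8u, 4u}) {
    unsigned Merged = 1;
    // Try 128/64/32/16-bit accesses, largest first, like the real helper.
    for (uint32_t AccessSize : {16u, 8u, 4u, 2u})
      if ((Merged = canMergeAt(0, AccessSize, Sizes, Offsets, Align)) != 1)
        break;
    std::printf("align %2u -> %u piece(s) per access\n", Align, Merged);
  }
}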
-NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, - const NVPTXSubtarget &STI) - : TargetLowering(TM), nvTM(&TM), STI(STI) { - // always lower memset, memcpy, and memmove intrinsics to load/store - // instructions, rather - // then generating calls to memset, mempcy or memmove. - MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; - - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - // Jump is Expensive. Don't create extra control flow for 'and', 'or' - // condition branches. - setJumpIsExpensive(true); - - // Wide divides are _very_ slow. Try to reduce the width of the divide if - // possible. - addBypassSlowDiv(64, 32); - - // By default, use the Source scheduling - if (sched4reg) - setSchedulingPreference(Sched::RegPressure); - else - setSchedulingPreference(Sched::Source); - - auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, - LegalizeAction NoF16Action) { - setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); - }; - - addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); - addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); - addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); - addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); - addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); - addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); - addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass); - addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass); - - // Conversion to/from FP16/FP16x2 is always legal. - setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - - setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); - setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); - - // Operations not directly supported by NVPTX. - setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::BR_CC, MVT::f16, Expand); - setOperationAction(ISD::BR_CC, MVT::v2f16, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BR_CC, MVT::f64, Expand); - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::BR_CC, MVT::i8, Expand); - setOperationAction(ISD::BR_CC, MVT::i16, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::i64, Expand); - // Some SIGN_EXTEND_INREG can be done using cvt instruction. - // For others we will expand to a SHL/SRA pair. 
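  // Illustrative example (hypothetical values): sign_extend_inreg of the low
  // i8 of an i32 either becomes a single cvt.s32.s8, or, when expanded, a left
  // shift by 24 followed by an arithmetic right shift by 24, i.e. (x << 24) >> 24
  // with a sign-preserving shift.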
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); - setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); - - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - - if (STI.hasROT64()) { - setOperationAction(ISD::ROTL, MVT::i64, Legal); - setOperationAction(ISD::ROTR, MVT::i64, Legal); - } else { - setOperationAction(ISD::ROTL, MVT::i64, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); - } - if (STI.hasROT32()) { - setOperationAction(ISD::ROTL, MVT::i32, Legal); - setOperationAction(ISD::ROTR, MVT::i32, Legal); - } else { - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTR, MVT::i32, Expand); - } - - setOperationAction(ISD::ROTL, MVT::i16, Expand); - setOperationAction(ISD::ROTR, MVT::i16, Expand); - setOperationAction(ISD::ROTL, MVT::i8, Expand); - setOperationAction(ISD::ROTR, MVT::i8, Expand); - setOperationAction(ISD::BSWAP, MVT::i16, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Expand); - setOperationAction(ISD::BSWAP, MVT::i64, Expand); - - // Indirect branch is not supported. - // This also disables Jump Table creation. - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - - // We want to legalize constant related memmove and memcopy - // intrinsics. - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - - // Turn FP extload into load/fpextend - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); - // Turn FP truncstore into trunc + store. 
- // FIXME: vector types should also be expanded - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - // PTX does not support load / store predicate registers - setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setTruncStoreAction(VT, MVT::i1, Expand); - } - - // This is legal in NVPTX - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f16, Legal); - - // TRAP can be lowered to PTX trap - setOperationAction(ISD::TRAP, MVT::Other, Legal); - - setOperationAction(ISD::ADDC, MVT::i64, Expand); - setOperationAction(ISD::ADDE, MVT::i64, Expand); - - // Register custom handling for vector loads/stores - for (MVT VT : MVT::vector_valuetypes()) { - if (IsPTXVectorType(VT)) { - setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); - } - } - - // Custom handling for i8 intrinsics - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { - setOperationAction(ISD::SMIN, Ty, Legal); - setOperationAction(ISD::SMAX, Ty, Legal); - setOperationAction(ISD::UMIN, Ty, Legal); - setOperationAction(ISD::UMAX, Ty, Legal); - - setOperationAction(ISD::CTPOP, Ty, Legal); - setOperationAction(ISD::CTLZ, Ty, Legal); - } - - setOperationAction(ISD::CTTZ, MVT::i16, Expand); - setOperationAction(ISD::CTTZ, MVT::i32, Expand); - setOperationAction(ISD::CTTZ, MVT::i64, Expand); - - // PTX does not directly support SELP of i1, so promote to i32 first - setOperationAction(ISD::SELECT, MVT::i1, Custom); - - // PTX cannot multiply two i64s in a single instruction. - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - - // We have some custom DAG combine patterns for these nodes - setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::SREM); - setTargetDAGCombine(ISD::UREM); - - // setcc for f16x2 needs special handling to prevent legalizer's - // attempt to scalarize it due to v2i1 not being legal. - if (STI.allowFP16Math()) - setTargetDAGCombine(ISD::SETCC); - - // Promote fp16 arithmetic if fp16 hardware isn't available or the - // user passed --nvptx-no-fp16-math. The flag is useful because, - // although sm_53+ GPUs have some sort of FP16 support in - // hardware, only sm_53 and sm_60 have full implementation. Others - // only have token amount of hardware and are likely to run faster - // by using fp32 units instead. - for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { - setFP16OperationAction(Op, MVT::f16, Legal, Promote); - setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); - } - - // There's no neg.f16 instruction. Expand to (0-x). - setOperationAction(ISD::FNEG, MVT::f16, Expand); - setOperationAction(ISD::FNEG, MVT::v2f16, Expand); - - // (would be) Library functions. - - // These map to conversion instructions for scalar FP types. 
- for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, - ISD::FROUND, ISD::FTRUNC}) { - setOperationAction(Op, MVT::f16, Legal); - setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - } - - // 'Expand' implements FCOPYSIGN without calling an external library. - setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - - // These map to corresponding instructions for f32/f64. f16 must be - // promoted to f32. v2f16 is expanded to f16, which is then promoted - // to f32. - for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, - ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { - setOperationAction(Op, MVT::f16, Promote); - setOperationAction(Op, MVT::f32, Legal); - setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - } - setOperationAction(ISD::FMINNUM, MVT::f16, Promote); - setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); - setOperationAction(ISD::FMINNAN, MVT::f16, Promote); - setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); - - // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. - // No FPOW or FREM in PTX. - - // Now deduce the information based on the above mentioned - // actions - computeRegisterProperties(STI.getRegisterInfo()); -} - -const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((NVPTXISD::NodeType)Opcode) { - case NVPTXISD::FIRST_NUMBER: - break; - case NVPTXISD::CALL: - return "NVPTXISD::CALL"; - case NVPTXISD::RET_FLAG: - return "NVPTXISD::RET_FLAG"; - case NVPTXISD::LOAD_PARAM: - return "NVPTXISD::LOAD_PARAM"; - case NVPTXISD::Wrapper: - return "NVPTXISD::Wrapper"; - case NVPTXISD::DeclareParam: - return "NVPTXISD::DeclareParam"; - case NVPTXISD::DeclareScalarParam: - return "NVPTXISD::DeclareScalarParam"; - case NVPTXISD::DeclareRet: - return "NVPTXISD::DeclareRet"; - case NVPTXISD::DeclareScalarRet: - return "NVPTXISD::DeclareScalarRet"; - case NVPTXISD::DeclareRetParam: - return "NVPTXISD::DeclareRetParam"; - case NVPTXISD::PrintCall: - return "NVPTXISD::PrintCall"; - case NVPTXISD::PrintConvergentCall: - return "NVPTXISD::PrintConvergentCall"; - case NVPTXISD::PrintCallUni: - return "NVPTXISD::PrintCallUni"; - case NVPTXISD::PrintConvergentCallUni: - return "NVPTXISD::PrintConvergentCallUni"; - case NVPTXISD::LoadParam: - return "NVPTXISD::LoadParam"; - case NVPTXISD::LoadParamV2: - return "NVPTXISD::LoadParamV2"; - case NVPTXISD::LoadParamV4: - return "NVPTXISD::LoadParamV4"; - case NVPTXISD::StoreParam: - return "NVPTXISD::StoreParam"; - case NVPTXISD::StoreParamV2: - return "NVPTXISD::StoreParamV2"; - case NVPTXISD::StoreParamV4: - return "NVPTXISD::StoreParamV4"; - case NVPTXISD::StoreParamS32: - return "NVPTXISD::StoreParamS32"; - case NVPTXISD::StoreParamU32: - return "NVPTXISD::StoreParamU32"; - case NVPTXISD::CallArgBegin: - return "NVPTXISD::CallArgBegin"; - case NVPTXISD::CallArg: - return "NVPTXISD::CallArg"; - case NVPTXISD::LastCallArg: - return "NVPTXISD::LastCallArg"; - case NVPTXISD::CallArgEnd: - return "NVPTXISD::CallArgEnd"; - case NVPTXISD::CallVoid: - return "NVPTXISD::CallVoid"; - case NVPTXISD::CallVal: - return "NVPTXISD::CallVal"; - case NVPTXISD::CallSymbol: - return "NVPTXISD::CallSymbol"; - case NVPTXISD::Prototype: - return "NVPTXISD::Prototype"; - case 
NVPTXISD::MoveParam: - return "NVPTXISD::MoveParam"; - case NVPTXISD::StoreRetval: - return "NVPTXISD::StoreRetval"; - case NVPTXISD::StoreRetvalV2: - return "NVPTXISD::StoreRetvalV2"; - case NVPTXISD::StoreRetvalV4: - return "NVPTXISD::StoreRetvalV4"; - case NVPTXISD::PseudoUseParam: - return "NVPTXISD::PseudoUseParam"; - case NVPTXISD::RETURN: - return "NVPTXISD::RETURN"; - case NVPTXISD::CallSeqBegin: - return "NVPTXISD::CallSeqBegin"; - case NVPTXISD::CallSeqEnd: - return "NVPTXISD::CallSeqEnd"; - case NVPTXISD::CallPrototype: - return "NVPTXISD::CallPrototype"; - case NVPTXISD::LoadV2: - return "NVPTXISD::LoadV2"; - case NVPTXISD::LoadV4: - return "NVPTXISD::LoadV4"; - case NVPTXISD::LDGV2: - return "NVPTXISD::LDGV2"; - case NVPTXISD::LDGV4: - return "NVPTXISD::LDGV4"; - case NVPTXISD::LDUV2: - return "NVPTXISD::LDUV2"; - case NVPTXISD::LDUV4: - return "NVPTXISD::LDUV4"; - case NVPTXISD::StoreV2: - return "NVPTXISD::StoreV2"; - case NVPTXISD::StoreV4: - return "NVPTXISD::StoreV4"; - case NVPTXISD::FUN_SHFL_CLAMP: - return "NVPTXISD::FUN_SHFL_CLAMP"; - case NVPTXISD::FUN_SHFR_CLAMP: - return "NVPTXISD::FUN_SHFR_CLAMP"; - case NVPTXISD::IMAD: - return "NVPTXISD::IMAD"; - case NVPTXISD::SETP_F16X2: - return "NVPTXISD::SETP_F16X2"; - case NVPTXISD::Dummy: - return "NVPTXISD::Dummy"; - case NVPTXISD::MUL_WIDE_SIGNED: - return "NVPTXISD::MUL_WIDE_SIGNED"; - case NVPTXISD::MUL_WIDE_UNSIGNED: - return "NVPTXISD::MUL_WIDE_UNSIGNED"; - case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; - case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; - case NVPTXISD::Tex1DFloatFloatLevel: - return "NVPTXISD::Tex1DFloatFloatLevel"; - case NVPTXISD::Tex1DFloatFloatGrad: - return "NVPTXISD::Tex1DFloatFloatGrad"; - case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; - case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; - case NVPTXISD::Tex1DS32FloatLevel: - return "NVPTXISD::Tex1DS32FloatLevel"; - case NVPTXISD::Tex1DS32FloatGrad: - return "NVPTXISD::Tex1DS32FloatGrad"; - case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; - case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; - case NVPTXISD::Tex1DU32FloatLevel: - return "NVPTXISD::Tex1DU32FloatLevel"; - case NVPTXISD::Tex1DU32FloatGrad: - return "NVPTXISD::Tex1DU32FloatGrad"; - case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; - case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; - case NVPTXISD::Tex1DArrayFloatFloatLevel: - return "NVPTXISD::Tex1DArrayFloatFloatLevel"; - case NVPTXISD::Tex1DArrayFloatFloatGrad: - return "NVPTXISD::Tex1DArrayFloatFloatGrad"; - case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; - case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; - case NVPTXISD::Tex1DArrayS32FloatLevel: - return "NVPTXISD::Tex1DArrayS32FloatLevel"; - case NVPTXISD::Tex1DArrayS32FloatGrad: - return "NVPTXISD::Tex1DArrayS32FloatGrad"; - case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; - case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; - case NVPTXISD::Tex1DArrayU32FloatLevel: - return "NVPTXISD::Tex1DArrayU32FloatLevel"; - case NVPTXISD::Tex1DArrayU32FloatGrad: - return "NVPTXISD::Tex1DArrayU32FloatGrad"; - case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; - case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; - case NVPTXISD::Tex2DFloatFloatLevel: - return "NVPTXISD::Tex2DFloatFloatLevel"; - case NVPTXISD::Tex2DFloatFloatGrad: - 
return "NVPTXISD::Tex2DFloatFloatGrad"; - case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; - case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; - case NVPTXISD::Tex2DS32FloatLevel: - return "NVPTXISD::Tex2DS32FloatLevel"; - case NVPTXISD::Tex2DS32FloatGrad: - return "NVPTXISD::Tex2DS32FloatGrad"; - case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; - case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; - case NVPTXISD::Tex2DU32FloatLevel: - return "NVPTXISD::Tex2DU32FloatLevel"; - case NVPTXISD::Tex2DU32FloatGrad: - return "NVPTXISD::Tex2DU32FloatGrad"; - case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; - case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; - case NVPTXISD::Tex2DArrayFloatFloatLevel: - return "NVPTXISD::Tex2DArrayFloatFloatLevel"; - case NVPTXISD::Tex2DArrayFloatFloatGrad: - return "NVPTXISD::Tex2DArrayFloatFloatGrad"; - case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; - case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; - case NVPTXISD::Tex2DArrayS32FloatLevel: - return "NVPTXISD::Tex2DArrayS32FloatLevel"; - case NVPTXISD::Tex2DArrayS32FloatGrad: - return "NVPTXISD::Tex2DArrayS32FloatGrad"; - case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; - case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; - case NVPTXISD::Tex2DArrayU32FloatLevel: - return "NVPTXISD::Tex2DArrayU32FloatLevel"; - case NVPTXISD::Tex2DArrayU32FloatGrad: - return "NVPTXISD::Tex2DArrayU32FloatGrad"; - case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; - case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; - case NVPTXISD::Tex3DFloatFloatLevel: - return "NVPTXISD::Tex3DFloatFloatLevel"; - case NVPTXISD::Tex3DFloatFloatGrad: - return "NVPTXISD::Tex3DFloatFloatGrad"; - case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; - case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; - case NVPTXISD::Tex3DS32FloatLevel: - return "NVPTXISD::Tex3DS32FloatLevel"; - case NVPTXISD::Tex3DS32FloatGrad: - return "NVPTXISD::Tex3DS32FloatGrad"; - case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; - case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; - case NVPTXISD::Tex3DU32FloatLevel: - return "NVPTXISD::Tex3DU32FloatLevel"; - case NVPTXISD::Tex3DU32FloatGrad: - return "NVPTXISD::Tex3DU32FloatGrad"; - case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; - case NVPTXISD::TexCubeFloatFloatLevel: - return "NVPTXISD::TexCubeFloatFloatLevel"; - case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; - case NVPTXISD::TexCubeS32FloatLevel: - return "NVPTXISD::TexCubeS32FloatLevel"; - case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; - case NVPTXISD::TexCubeU32FloatLevel: - return "NVPTXISD::TexCubeU32FloatLevel"; - case NVPTXISD::TexCubeArrayFloatFloat: - return "NVPTXISD::TexCubeArrayFloatFloat"; - case NVPTXISD::TexCubeArrayFloatFloatLevel: - return "NVPTXISD::TexCubeArrayFloatFloatLevel"; - case NVPTXISD::TexCubeArrayS32Float: - return "NVPTXISD::TexCubeArrayS32Float"; - case NVPTXISD::TexCubeArrayS32FloatLevel: - return "NVPTXISD::TexCubeArrayS32FloatLevel"; - case NVPTXISD::TexCubeArrayU32Float: - return "NVPTXISD::TexCubeArrayU32Float"; - case NVPTXISD::TexCubeArrayU32FloatLevel: - return "NVPTXISD::TexCubeArrayU32FloatLevel"; - case NVPTXISD::Tld4R2DFloatFloat: - return "NVPTXISD::Tld4R2DFloatFloat"; - case NVPTXISD::Tld4G2DFloatFloat: 
- return "NVPTXISD::Tld4G2DFloatFloat"; - case NVPTXISD::Tld4B2DFloatFloat: - return "NVPTXISD::Tld4B2DFloatFloat"; - case NVPTXISD::Tld4A2DFloatFloat: - return "NVPTXISD::Tld4A2DFloatFloat"; - case NVPTXISD::Tld4R2DS64Float: - return "NVPTXISD::Tld4R2DS64Float"; - case NVPTXISD::Tld4G2DS64Float: - return "NVPTXISD::Tld4G2DS64Float"; - case NVPTXISD::Tld4B2DS64Float: - return "NVPTXISD::Tld4B2DS64Float"; - case NVPTXISD::Tld4A2DS64Float: - return "NVPTXISD::Tld4A2DS64Float"; - case NVPTXISD::Tld4R2DU64Float: - return "NVPTXISD::Tld4R2DU64Float"; - case NVPTXISD::Tld4G2DU64Float: - return "NVPTXISD::Tld4G2DU64Float"; - case NVPTXISD::Tld4B2DU64Float: - return "NVPTXISD::Tld4B2DU64Float"; - case NVPTXISD::Tld4A2DU64Float: - return "NVPTXISD::Tld4A2DU64Float"; - - case NVPTXISD::TexUnified1DFloatS32: - return "NVPTXISD::TexUnified1DFloatS32"; - case NVPTXISD::TexUnified1DFloatFloat: - return "NVPTXISD::TexUnified1DFloatFloat"; - case NVPTXISD::TexUnified1DFloatFloatLevel: - return "NVPTXISD::TexUnified1DFloatFloatLevel"; - case NVPTXISD::TexUnified1DFloatFloatGrad: - return "NVPTXISD::TexUnified1DFloatFloatGrad"; - case NVPTXISD::TexUnified1DS32S32: - return "NVPTXISD::TexUnified1DS32S32"; - case NVPTXISD::TexUnified1DS32Float: - return "NVPTXISD::TexUnified1DS32Float"; - case NVPTXISD::TexUnified1DS32FloatLevel: - return "NVPTXISD::TexUnified1DS32FloatLevel"; - case NVPTXISD::TexUnified1DS32FloatGrad: - return "NVPTXISD::TexUnified1DS32FloatGrad"; - case NVPTXISD::TexUnified1DU32S32: - return "NVPTXISD::TexUnified1DU32S32"; - case NVPTXISD::TexUnified1DU32Float: - return "NVPTXISD::TexUnified1DU32Float"; - case NVPTXISD::TexUnified1DU32FloatLevel: - return "NVPTXISD::TexUnified1DU32FloatLevel"; - case NVPTXISD::TexUnified1DU32FloatGrad: - return "NVPTXISD::TexUnified1DU32FloatGrad"; - case NVPTXISD::TexUnified1DArrayFloatS32: - return "NVPTXISD::TexUnified1DArrayFloatS32"; - case NVPTXISD::TexUnified1DArrayFloatFloat: - return "NVPTXISD::TexUnified1DArrayFloatFloat"; - case NVPTXISD::TexUnified1DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified1DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified1DArrayS32S32: - return "NVPTXISD::TexUnified1DArrayS32S32"; - case NVPTXISD::TexUnified1DArrayS32Float: - return "NVPTXISD::TexUnified1DArrayS32Float"; - case NVPTXISD::TexUnified1DArrayS32FloatLevel: - return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; - case NVPTXISD::TexUnified1DArrayS32FloatGrad: - return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; - case NVPTXISD::TexUnified1DArrayU32S32: - return "NVPTXISD::TexUnified1DArrayU32S32"; - case NVPTXISD::TexUnified1DArrayU32Float: - return "NVPTXISD::TexUnified1DArrayU32Float"; - case NVPTXISD::TexUnified1DArrayU32FloatLevel: - return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; - case NVPTXISD::TexUnified1DArrayU32FloatGrad: - return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; - case NVPTXISD::TexUnified2DFloatS32: - return "NVPTXISD::TexUnified2DFloatS32"; - case NVPTXISD::TexUnified2DFloatFloat: - return "NVPTXISD::TexUnified2DFloatFloat"; - case NVPTXISD::TexUnified2DFloatFloatLevel: - return "NVPTXISD::TexUnified2DFloatFloatLevel"; - case NVPTXISD::TexUnified2DFloatFloatGrad: - return "NVPTXISD::TexUnified2DFloatFloatGrad"; - case NVPTXISD::TexUnified2DS32S32: - return "NVPTXISD::TexUnified2DS32S32"; - case NVPTXISD::TexUnified2DS32Float: - return "NVPTXISD::TexUnified2DS32Float"; - case NVPTXISD::TexUnified2DS32FloatLevel: - return 
"NVPTXISD::TexUnified2DS32FloatLevel"; - case NVPTXISD::TexUnified2DS32FloatGrad: - return "NVPTXISD::TexUnified2DS32FloatGrad"; - case NVPTXISD::TexUnified2DU32S32: - return "NVPTXISD::TexUnified2DU32S32"; - case NVPTXISD::TexUnified2DU32Float: - return "NVPTXISD::TexUnified2DU32Float"; - case NVPTXISD::TexUnified2DU32FloatLevel: - return "NVPTXISD::TexUnified2DU32FloatLevel"; - case NVPTXISD::TexUnified2DU32FloatGrad: - return "NVPTXISD::TexUnified2DU32FloatGrad"; - case NVPTXISD::TexUnified2DArrayFloatS32: - return "NVPTXISD::TexUnified2DArrayFloatS32"; - case NVPTXISD::TexUnified2DArrayFloatFloat: - return "NVPTXISD::TexUnified2DArrayFloatFloat"; - case NVPTXISD::TexUnified2DArrayFloatFloatLevel: - return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; - case NVPTXISD::TexUnified2DArrayFloatFloatGrad: - return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; - case NVPTXISD::TexUnified2DArrayS32S32: - return "NVPTXISD::TexUnified2DArrayS32S32"; - case NVPTXISD::TexUnified2DArrayS32Float: - return "NVPTXISD::TexUnified2DArrayS32Float"; - case NVPTXISD::TexUnified2DArrayS32FloatLevel: - return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; - case NVPTXISD::TexUnified2DArrayS32FloatGrad: - return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; - case NVPTXISD::TexUnified2DArrayU32S32: - return "NVPTXISD::TexUnified2DArrayU32S32"; - case NVPTXISD::TexUnified2DArrayU32Float: - return "NVPTXISD::TexUnified2DArrayU32Float"; - case NVPTXISD::TexUnified2DArrayU32FloatLevel: - return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; - case NVPTXISD::TexUnified2DArrayU32FloatGrad: - return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; - case NVPTXISD::TexUnified3DFloatS32: - return "NVPTXISD::TexUnified3DFloatS32"; - case NVPTXISD::TexUnified3DFloatFloat: - return "NVPTXISD::TexUnified3DFloatFloat"; - case NVPTXISD::TexUnified3DFloatFloatLevel: - return "NVPTXISD::TexUnified3DFloatFloatLevel"; - case NVPTXISD::TexUnified3DFloatFloatGrad: - return "NVPTXISD::TexUnified3DFloatFloatGrad"; - case NVPTXISD::TexUnified3DS32S32: - return "NVPTXISD::TexUnified3DS32S32"; - case NVPTXISD::TexUnified3DS32Float: - return "NVPTXISD::TexUnified3DS32Float"; - case NVPTXISD::TexUnified3DS32FloatLevel: - return "NVPTXISD::TexUnified3DS32FloatLevel"; - case NVPTXISD::TexUnified3DS32FloatGrad: - return "NVPTXISD::TexUnified3DS32FloatGrad"; - case NVPTXISD::TexUnified3DU32S32: - return "NVPTXISD::TexUnified3DU32S32"; - case NVPTXISD::TexUnified3DU32Float: - return "NVPTXISD::TexUnified3DU32Float"; - case NVPTXISD::TexUnified3DU32FloatLevel: - return "NVPTXISD::TexUnified3DU32FloatLevel"; - case NVPTXISD::TexUnified3DU32FloatGrad: - return "NVPTXISD::TexUnified3DU32FloatGrad"; - case NVPTXISD::TexUnifiedCubeFloatFloat: - return "NVPTXISD::TexUnifiedCubeFloatFloat"; - case NVPTXISD::TexUnifiedCubeFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeS32Float: - return "NVPTXISD::TexUnifiedCubeS32Float"; - case NVPTXISD::TexUnifiedCubeS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeU32Float: - return "NVPTXISD::TexUnifiedCubeU32Float"; - case NVPTXISD::TexUnifiedCubeU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloat: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; - case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayS32Float: - return "NVPTXISD::TexUnifiedCubeArrayS32Float"; - case 
NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; - case NVPTXISD::TexUnifiedCubeArrayU32Float: - return "NVPTXISD::TexUnifiedCubeArrayU32Float"; - case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: - return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; - case NVPTXISD::Tld4UnifiedR2DFloatFloat: - return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; - case NVPTXISD::Tld4UnifiedG2DFloatFloat: - return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; - case NVPTXISD::Tld4UnifiedB2DFloatFloat: - return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; - case NVPTXISD::Tld4UnifiedA2DFloatFloat: - return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; - case NVPTXISD::Tld4UnifiedR2DS64Float: - return "NVPTXISD::Tld4UnifiedR2DS64Float"; - case NVPTXISD::Tld4UnifiedG2DS64Float: - return "NVPTXISD::Tld4UnifiedG2DS64Float"; - case NVPTXISD::Tld4UnifiedB2DS64Float: - return "NVPTXISD::Tld4UnifiedB2DS64Float"; - case NVPTXISD::Tld4UnifiedA2DS64Float: - return "NVPTXISD::Tld4UnifiedA2DS64Float"; - case NVPTXISD::Tld4UnifiedR2DU64Float: - return "NVPTXISD::Tld4UnifiedR2DU64Float"; - case NVPTXISD::Tld4UnifiedG2DU64Float: - return "NVPTXISD::Tld4UnifiedG2DU64Float"; - case NVPTXISD::Tld4UnifiedB2DU64Float: - return "NVPTXISD::Tld4UnifiedB2DU64Float"; - case NVPTXISD::Tld4UnifiedA2DU64Float: - return "NVPTXISD::Tld4UnifiedA2DU64Float"; - - case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; - case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; - case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; - case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; - case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; - case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; - case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; - case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; - case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; - case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; - case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; - - case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; - case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; - case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; - case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; - case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; - case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; - case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; - case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; - case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; - case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; - case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; - - case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; - case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; - case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; - case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; - case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; - case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; - case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; - case NVPTXISD::Suld2DV2I64Clamp: return 
"NVPTXISD::Suld2DV2I64Clamp"; - case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; - case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; - case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; - - case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; - case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; - case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; - case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; - case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; - case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; - case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; - case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; - case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; - case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; - case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; - - case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; - case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; - case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; - case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; - case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; - case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; - case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; - case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; - case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; - case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; - case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; - - case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; - case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; - case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; - case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; - case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; - case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; - case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; - case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; - case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; - case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; - case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; - - case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; - case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; - case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; - case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; - case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; - case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; - case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; - case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; - case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; - case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; - case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; - - case NVPTXISD::Suld2DI8Trap: return 
"NVPTXISD::Suld2DI8Trap"; - case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; - case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; - case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; - case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; - case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; - case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; - case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; - case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; - case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; - case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; - - case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; - case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; - case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; - case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; - case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; - case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; - case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; - case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; - case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; - case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; - case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; - - case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; - case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; - case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; - case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; - case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; - case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; - case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; - case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; - case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; - case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; - case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; - - case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; - case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; - case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; - case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; - case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; - case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; - case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; - case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; - case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; - case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; - case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; - - case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; - case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; - case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; - case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; - case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; - case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; - case 
NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; - case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; - case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; - case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; - case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; - - case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; - case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; - case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; - case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; - case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; - case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; - case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; - case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; - case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; - case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; - case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; - - case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; - case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; - case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; - case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; - case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; - case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; - case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; - case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; - case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; - case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; - case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; - - case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; - case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; - case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; - case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; - case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; - case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; - case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; - case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; - case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; - case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; - case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; - } - return nullptr; -} - -TargetLoweringBase::LegalizeTypeAction -NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) - return TypeSplitVector; - if (VT == MVT::v2f16) - return TypeLegal; - return TargetLoweringBase::getPreferredVectorAction(VT); -} - -SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, - int Enabled, int &ExtraSteps, - bool &UseOneConst, - bool Reciprocal) const { - if (!(Enabled == ReciprocalEstimate::Enabled || - (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) - return SDValue(); - - if (ExtraSteps == ReciprocalEstimate::Unspecified) - ExtraSteps = 0; - - SDLoc DL(Operand); - EVT VT = Operand.getValueType(); - bool Ftz = 
useF32FTZ(DAG.getMachineFunction()); - - auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(IID, DL, MVT::i32), Operand); - }; - - // The sqrt and rsqrt refinement processes assume we always start out with an - // approximation of the rsqrt. Therefore, if we're going to do any refinement - // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing - // any refinement, we must return a regular sqrt. - if (Reciprocal || ExtraSteps > 0) { - if (VT == MVT::f32) - return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f - : Intrinsic::nvvm_rsqrt_approx_f); - else if (VT == MVT::f64) - return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); - else - return SDValue(); - } else { - if (VT == MVT::f32) - return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f - : Intrinsic::nvvm_sqrt_approx_f); - else { - // There's no sqrt.approx.f64 instruction, so we emit - // reciprocal(rsqrt(x)). This is faster than - // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain - // x * rsqrt(x).) - return DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), - MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); - } - } -} - -SDValue -NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - const GlobalValue *GV = cast(Op)->getGlobal(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); - return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); -} - -std::string NVPTXTargetLowering::getPrototype( - const DataLayout &DL, Type *retTy, const ArgListTy &Args, - const SmallVectorImpl &Outs, unsigned retAlignment, - const ImmutableCallSite *CS) const { - auto PtrVT = getPointerTy(DL); - - bool isABI = (STI.getSmVersion() >= 20); - assert(isABI && "Non-ABI compilation is not supported"); - if (!isABI) - return ""; - - std::stringstream O; - O << "prototype_" << uniqueCallSite << " : .callprototype "; - - if (retTy->getTypeID() == Type::VoidTyID) { - O << "()"; - } else { - O << "("; - if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { - unsigned size = 0; - if (auto *ITy = dyn_cast(retTy)) { - size = ITy->getBitWidth(); - } else { - assert(retTy->isFloatingPointTy() && - "Floating point type expected here"); - size = retTy->getPrimitiveSizeInBits(); - } - // PTX ABI requires all scalar return values to be at least 32 - // bits in size. fp16 normally uses .b16 as its storage type in - // PTX, so its size must be adjusted here, too. 
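      // (Illustrative example: a call returning half or i16 is therefore
      // declared as ".param .b32 _" in the prototype rather than ".b16".)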
- if (size < 32) - size = 32; - - O << ".param .b" << size << " _"; - } else if (isa(retTy)) { - O << ".param .b" << PtrVT.getSizeInBits() << " _"; - } else if (retTy->isAggregateType() || retTy->isVectorTy()) { - auto &DL = CS->getCalledFunction()->getParent()->getDataLayout(); - O << ".param .align " << retAlignment << " .b8 _[" - << DL.getTypeAllocSize(retTy) << "]"; - } else { - llvm_unreachable("Unknown return type"); - } - O << ") "; - } - O << "_ ("; - - bool first = true; - - unsigned OIdx = 0; - for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { - Type *Ty = Args[i].Ty; - if (!first) { - O << ", "; - } - first = false; - - if (!Outs[OIdx].Flags.isByVal()) { - if (Ty->isAggregateType() || Ty->isVectorTy()) { - unsigned align = 0; - const CallInst *CallI = cast(CS->getInstruction()); - // +1 because index 0 is reserved for return type alignment - if (!getAlign(*CallI, i + 1, align)) - align = DL.getABITypeAlignment(Ty); - unsigned sz = DL.getTypeAllocSize(Ty); - O << ".param .align " << align << " .b8 "; - O << "_"; - O << "[" << sz << "]"; - // update the index for Outs - SmallVector vtparts; - ComputeValueVTs(*this, DL, Ty, vtparts); - if (unsigned len = vtparts.size()) - OIdx += len - 1; - continue; - } - // i8 types in IR will be i16 types in SDAG - assert((getValueType(DL, Ty) == Outs[OIdx].VT || - (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && - "type mismatch between callee prototype and arguments"); - // scalar type - unsigned sz = 0; - if (isa(Ty)) { - sz = cast(Ty)->getBitWidth(); - if (sz < 32) - sz = 32; - } else if (isa(Ty)) { - sz = PtrVT.getSizeInBits(); - } else if (Ty->isHalfTy()) - // PTX ABI requires all scalar parameters to be at least 32 - // bits in size. fp16 normally uses .b16 as its storage type - // in PTX, so its size must be adjusted here, too. - sz = 32; - else - sz = Ty->getPrimitiveSizeInBits(); - O << ".param .b" << sz << " "; - O << "_"; - continue; - } - auto *PTy = dyn_cast(Ty); - assert(PTy && "Param with byval attribute should be a pointer type"); - Type *ETy = PTy->getElementType(); - - unsigned align = Outs[OIdx].Flags.getByValAlign(); - unsigned sz = DL.getTypeAllocSize(ETy); - O << ".param .align " << align << " .b8 "; - O << "_"; - O << "[" << sz << "]"; - } - O << ");"; - return O.str(); -} - -unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, - const ImmutableCallSite *CS, - Type *Ty, unsigned Idx, - const DataLayout &DL) const { - if (!CS) { - // CallSite is zero, fallback to ABI type alignment - return DL.getABITypeAlignment(Ty); - } - - unsigned Align = 0; - const Value *DirectCallee = CS->getCalledFunction(); - - if (!DirectCallee) { - // We don't have a direct function symbol, but that may be because of - // constant cast instructions in the call. - const Instruction *CalleeI = CS->getInstruction(); - assert(CalleeI && "Call target is not a function or derived value?"); - - // With bitcast'd call targets, the instruction will be the call - if (isa(CalleeI)) { - // Check if we have call alignment metadata - if (getAlign(*cast(CalleeI), Idx, Align)) - return Align; - - const Value *CalleeV = cast(CalleeI)->getCalledValue(); - // Ignore any bitcast instructions - while (isa(CalleeV)) { - const ConstantExpr *CE = cast(CalleeV); - if (!CE->isCast()) - break; - // Look through the bitcast - CalleeV = cast(CalleeV)->getOperand(0); - } - - // We have now looked past all of the bitcasts. Do we finally have a - // Function? 
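  // Illustrative example (hypothetical IR): for a call such as
  //   call float bitcast (i32 (i32)* @foo to float (i32)*)(i32 %x)
  // the loop above strips the constant bitcast so CalleeV now refers to @foo,
  // whose alignment annotation (if any) can then be queried below.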
- if (isa(CalleeV)) - DirectCallee = CalleeV; - } - } - - // Check for function alignment information if we found that the - // ultimate target is a Function - if (DirectCallee) - if (getAlign(*cast(DirectCallee), Idx, Align)) - return Align; - - // Call is indirect or alignment information is not available, fall back to - // the ABI type alignment - return DL.getABITypeAlignment(Ty); -} - -SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc dl = CLI.DL; - SmallVectorImpl &Outs = CLI.Outs; - SmallVectorImpl &OutVals = CLI.OutVals; - SmallVectorImpl &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; - ArgListTy &Args = CLI.getArgs(); - Type *RetTy = CLI.RetTy; - ImmutableCallSite *CS = CLI.CS; - const DataLayout &DL = DAG.getDataLayout(); - - bool isABI = (STI.getSmVersion() >= 20); - assert(isABI && "Non-ABI compilation is not supported"); - if (!isABI) - return Chain; - - SDValue tempChain = Chain; - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl); - SDValue InFlag = Chain.getValue(1); - - unsigned paramCount = 0; - // Args.size() and Outs.size() need not match. - // Outs.size() will be larger - // * if there is an aggregate argument with multiple fields (each field - // showing up separately in Outs) - // * if there is a vector argument with more than typical vector-length - // elements (generally if more than 4) where each vector element is - // individually present in Outs. - // So a different index should be used for indexing into Outs/OutVals. - // See similar issue in LowerFormalArguments. - unsigned OIdx = 0; - // Declare the .params or .reg need to pass values - // to the function - for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { - EVT VT = Outs[OIdx].VT; - Type *Ty = Args[i].Ty; - - if (!Outs[OIdx].Flags.isByVal()) { - SmallVector VTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); - unsigned ArgAlign = - getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); - unsigned AllocSize = DL.getTypeAllocSize(Ty); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - bool NeedAlign; // Does argument declaration specify alignment? - if (Ty->isAggregateType() || Ty->isVectorTy()) { - // declare .param .align .b8 .param[]; - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - NeedAlign = true; - } else { - // declare .param .b .param; - if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - AllocSize = 4; - } - SDValue DeclareScalarParamOps[] = { - Chain, DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(AllocSize * 8, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareScalarParamOps); - NeedAlign = false; - } - InFlag = Chain.getValue(1); - - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter - // than 32-bits are sign extended or zero extended, depending on - // whether they are signed or unsigned types. 
This case applies - // only to scalar parameters and not to aggregate values. - bool ExtendIntegerParam = - Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; - - auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); - SmallVector StoreOperands; - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceeding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); - } - - EVT EltVT = VTs[j]; - SDValue StVal = OutVals[OIdx]; - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, - dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. - StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - - // Record the value to store. - StoreOperands.push_back(StVal); - - if (VectorInfo[j] & PVF_LAST) { - unsigned NumElts = StoreOperands.size() - 3; - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } - - StoreOperands.push_back(InFlag); - - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; - unsigned EltAlign = - NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; - - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), EltAlign); - InFlag = Chain.getValue(1); - - // Cleanup. - StoreOperands.clear(); - } - ++OIdx; - } - assert(StoreOperands.empty() && "Unfinished parameter store."); - if (VTs.size() > 0) - --OIdx; - ++paramCount; - continue; - } - - // ByVal arguments - SmallVector VTs; - SmallVector Offsets; - auto *PTy = dyn_cast(Args[i].Ty); - assert(PTy && "Type of a byval parameter should be pointer"); - ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); - - // declare .param .align .b8 .param[]; - unsigned sz = Outs[OIdx].Flags.getByValSize(); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); - // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, - // so we don't need to worry about natural alignment or not. - // See TargetLowering::LowerCallTo(). - - // Enforce minumum alignment of 4 to work around ptxas miscompile - // for sm_50+. See corresponding alignment adjustment in - // emitFunctionParamList() for details. 
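The per-element alignment fed into these StoreParam nodes (and into the byval copy that continues below) is just the greatest common divisor of the declared parameter alignment and the element's byte offset, which is what GreatestCommonDivisor64 computes. The same arithmetic in plain C++, with made-up numbers:

#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <numeric>

int main() {
  // A hypothetical aggregate parameter declared ".param .align 8", whose
  // scalar pieces live at byte offsets 0, 4, 8 and 12 of the param space.
  const uint64_t ArgAlign = 8;
  for (uint64_t Offset : {0ull, 4ull, 8ull, 12ull}) {
    // std::gcd(x, 0) == x, so the element at offset 0 keeps the full declared
    // alignment; the others get whatever their offset permits.
    uint64_t EltAlign = std::gcd(ArgAlign, Offset);
    std::printf("offset %2llu -> element alignment %llu\n",
                (unsigned long long)Offset, (unsigned long long)EltAlign);
  }
}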
- if (ArgAlign < 4) - ArgAlign = 4; - SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(sz, dl, MVT::i32), InFlag}; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps); - InFlag = Chain.getValue(1); - for (unsigned j = 0, je = VTs.size(); j != je; ++j) { - EVT elemtype = VTs[j]; - int curOffset = Offsets[j]; - unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); - auto PtrVT = getPointerTy(DL); - SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], - DAG.getConstant(curOffset, dl, PtrVT)); - SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), PartAlign); - if (elemtype.getSizeInBits() < 16) { - theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); - } - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, dl, MVT::i32), - DAG.getConstant(curOffset, dl, MVT::i32), - theVal, InFlag }; - Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, elemtype, - MachinePointerInfo()); - - InFlag = Chain.getValue(1); - } - ++paramCount; - } - - GlobalAddressSDNode *Func = dyn_cast(Callee.getNode()); - unsigned retAlignment = 0; - - // Handle Result - if (Ins.size() > 0) { - SmallVector resvtparts; - ComputeValueVTs(*this, DL, RetTy, resvtparts); - - // Declare - // .param .align 16 .b8 retval0[], or - // .param .b retval0 - unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); - // Emit ".param .b retval0" instead of byte arrays only for - // these three types to match the logic in - // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. - // Plus, this behavior is consistent with nvcc's. - if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() || - RetTy->isPointerTy()) { - // Scalar needs to be at least 32bit wide - if (resultsz < 32) - resultsz = 32; - SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(resultsz, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, - DeclareRetOps); - InFlag = Chain.getValue(1); - } else { - retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); - SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, - DAG.getConstant(retAlignment, dl, MVT::i32), - DAG.getConstant(resultsz / 8, dl, MVT::i32), - DAG.getConstant(0, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, - DeclareRetOps); - InFlag = Chain.getValue(1); - } - } - - if (!Func) { - // This is indirect function call case : PTX requires a prototype of the - // form - // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); - // to be emitted, and the label has to used as the last arg of call - // instruction. - // The prototype is embedded in a string and put as the operand for a - // CallPrototype SDNode which will print out to the value of the string. 
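For the indirect-call path described above, the string built by getPrototype lands in the PTX stream in roughly the following shape. This is a hand-written illustration for a callee of type i32(i32, float), not compiler output, and the index 0 simply stands in for uniqueCallSite:

#include <cstdio>

int main() {
  // Both i32 and float are 32-bit scalars, so each becomes ".param .b32 _".
  std::printf("prototype_0 : .callprototype (.param .b32 _) _ "
              "(.param .b32 _, .param .b32 _);\n");
}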
- SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); - std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); - const char *ProtoStr = - nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); - SDValue ProtoOps[] = { - Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, - }; - Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); - InFlag = Chain.getValue(1); - } - // Op to just print "call" - SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue PrintCallOps[] = { - Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag - }; - // We model convergent calls as separate opcodes. - unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall; - if (CLI.IsConvergent) - Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni - : NVPTXISD::PrintConvergentCall; - Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); - InFlag = Chain.getValue(1); - - // Ops to print out the function name - SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CallVoidOps[] = { Chain, Callee, InFlag }; - Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); - InFlag = Chain.getValue(1); - - // Ops to print out the param list - SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CallArgBeginOps[] = { Chain, InFlag }; - Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, - CallArgBeginOps); - InFlag = Chain.getValue(1); - - for (unsigned i = 0, e = paramCount; i != e; ++i) { - unsigned opcode; - if (i == (e - 1)) - opcode = NVPTXISD::LastCallArg; - else - opcode = NVPTXISD::CallArg; - SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(i, dl, MVT::i32), InFlag }; - Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); - InFlag = Chain.getValue(1); - } - SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CallArgEndOps[] = { Chain, - DAG.getConstant(Func ? 1 : 0, dl, MVT::i32), - InFlag }; - Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); - InFlag = Chain.getValue(1); - - if (!Func) { - SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue PrototypeOps[] = { Chain, - DAG.getConstant(uniqueCallSite, dl, MVT::i32), - InFlag }; - Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); - InFlag = Chain.getValue(1); - } - - // Generate loads from param memory/moves from registers for result - if (Ins.size() > 0) { - SmallVector VTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); - assert(VTs.size() == Ins.size() && "Bad value decomposition"); - - unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); - auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); - - SmallVector LoadVTs; - int VecIdx = -1; // Index of the first element of the vector. - - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than - // 32-bits are sign extended or zero extended, depending on whether - // they are signed or unsigned types. 
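Per the interoperability rule quoted above, an i8 or i16 return value travels through the retval .param widened to 32 bits and is truncated back after the LoadParam just below. The round trip is equivalent to the following plain C++, where signedness selects sign- versus zero-extension (the isSExt() check):

#include <cstdint>
#include <cstdio>

int main() {
  int8_t SignedRet = -5;
  uint8_t UnsignedRet = 200;

  // Callee side: widen to 32 bits before the StoreRetval.
  int32_t WidenedS = SignedRet;    // sign-extend (sext)
  uint32_t WidenedU = UnsignedRet; // zero-extend (zext)

  // Caller side: LoadParam reads 32 bits, then truncates to the IR type.
  int8_t BackS = static_cast<int8_t>(WidenedS);
  uint8_t BackU = static_cast<uint8_t>(WidenedU);

  std::printf("%d %u\n", BackS, (unsigned)BackU); // prints "-5 200"
}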
- bool ExtendIntegerRetVal = - RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; - - for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - bool needTruncate = false; - EVT TheLoadType = VTs[i]; - EVT EltType = Ins[i].VT; - unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - needTruncate = true; - } else if (TheLoadType.getSizeInBits() < 16) { - if (VTs[i].isInteger()) - needTruncate = true; - EltType = MVT::i16; - } - - // Record index of the very first element of the vector. - if (VectorInfo[i] & PVF_FIRST) { - assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); - VecIdx = i; - } - - LoadVTs.push_back(EltType); - - if (VectorInfo[i] & PVF_LAST) { - unsigned NumElts = LoadVTs.size(); - LoadVTs.push_back(MVT::Other); - LoadVTs.push_back(MVT::Glue); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } - - SDValue LoadOperands[] = { - Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign); - - for (unsigned j = 0; j < NumElts; ++j) { - SDValue Ret = RetVal.getValue(j); - if (needTruncate) - Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); - InVals.push_back(Ret); - } - Chain = RetVal.getValue(NumElts); - InFlag = RetVal.getValue(NumElts + 1); - - // Cleanup - VecIdx = -1; - LoadVTs.clear(); - } - } - } - - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(uniqueCallSite, dl, true), - DAG.getIntPtrConstant(uniqueCallSite + 1, dl, - true), - InFlag, dl); - uniqueCallSite++; - - // set isTailCall to false for now, until we figure out how to express - // tail call optimization in PTX - isTailCall = false; - return Chain; -} - -// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() -// (see LegalizeDAG.cpp). This is slow and uses local memory. -// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 -SDValue -NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); - SmallVector Ops; - unsigned NumOperands = Node->getNumOperands(); - for (unsigned i = 0; i < NumOperands; ++i) { - SDValue SubOp = Node->getOperand(i); - EVT VVT = SubOp.getNode()->getValueType(0); - EVT EltVT = VVT.getVectorElementType(); - unsigned NumSubElem = VVT.getVectorNumElements(); - for (unsigned j = 0; j < NumSubElem; ++j) { - Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, - DAG.getIntPtrConstant(j, dl))); - } - } - return DAG.getBuildVector(Node->getValueType(0), dl, Ops); -} - -// We can init constant f16x2 with a single .b32 move. Normally it -// would get lowered as two constant loads and vector-packing move. -// mov.b16 %h1, 0x4000; -// mov.b16 %h2, 0x3C00; -// mov.b32 %hh2, {%h2, %h1}; -// Instead we want just a constant move: -// mov.b32 %hh2, 0x40003C00 -// -// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 -// generates good SASS in both cases. 
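The constant folding described above just concatenates the two fp16 bit patterns, with element 0 in the low half-word, exactly as E1.zext(32).shl(16) | E0 does below. With the values from the comment (1.0 = 0x3C00, 2.0 = 0x4000) the packed immediate is 0x40003C00:

#include <cstdint>
#include <cstdio>

// Pack two IEEE-754 half bit patterns into the .b32 immediate used by mov.b32.
static uint32_t packF16x2(uint16_t E0, uint16_t E1) {
  return (static_cast<uint32_t>(E1) << 16) | E0;
}

int main() {
  std::printf("0x%08X\n", packF16x2(0x3C00 /*1.0*/, 0x4000 /*2.0*/)); // 0x40003C00
}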
-SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { - //return Op; - if (!(Op->getValueType(0) == MVT::v2f16 && - isa(Op->getOperand(0)) && - isa(Op->getOperand(1)))) - return Op; - - APInt E0 = - cast(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); - APInt E1 = - cast(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); - SDValue Const = - DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); - return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); -} - -SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Index = Op->getOperand(1); - // Constant index will be matched by tablegen. - if (isa(Index.getNode())) - return Op; - - // Extract individual elements and select one of them. - SDValue Vector = Op->getOperand(0); - EVT VectorVT = Vector.getValueType(); - assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); - EVT EltVT = VectorVT.getVectorElementType(); - - SDLoc dl(Op.getNode()); - SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, - DAG.getIntPtrConstant(0, dl)); - SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, - DAG.getIntPtrConstant(1, dl)); - return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, - ISD::CondCode::SETEQ); -} - -/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which -/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift -/// amount, or -/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift -/// amount. -SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - - if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
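For reference, a host-side emulation of the right-shift decomposition spelled out in the comments below, restricted to the logical (SRL) flavour and to shift amounts under 64; the funnel-shift fast path computes the same low word with a single shf.r.clamp:

#include <cstdint>
#include <cstdio>

// {OutHi, OutLo} = {Hi, Lo} >> Amt, using only 32-bit operations, Amt < 64.
static void srlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                     uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt == 0) { OutLo = Lo; OutHi = Hi; return; } // avoid UB on << 32
  if (Amt >= 32) {                                  // "Amt >= size" branch
    OutLo = Hi >> (Amt - 32);
    OutHi = 0;                                      // logical shift: all zeros
  } else {                                          // general branch
    OutLo = (Lo >> Amt) | (Hi << (32 - Amt));
    OutHi = Hi >> Amt;
  }
}

int main() {
  uint32_t Lo, Hi;
  srlParts(0x89ABCDEFu, 0x01234567u, 8, Lo, Hi);
  std::printf("0x%08X%08X\n", Hi, Lo); // 0x000123456789ABCD
}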
- // {dHi, dLo} = {aHi, aLo} >> Amt - // dHi = aHi >> Amt - // dLo = shf.r.clamp aLo, aHi, Amt - - SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, - ShAmt); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); - } - else { - // {dHi, dLo} = {aHi, aLo} >> Amt - // - if (Amt>=size) then - // dLo = aHi >> (Amt-size) - // dHi = aHi >> Amt (this is either all 0 or all 1) - // else - // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) - // dHi = aHi >> Amt - - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, - DAG.getConstant(VTBits, dl, MVT::i32), - ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, - DAG.getConstant(VTBits, dl, MVT::i32)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - - SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, - DAG.getConstant(VTBits, dl, MVT::i32), - ISD::SETGE); - SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); - } -} - -/// LowerShiftLeftParts - Lower SHL_PARTS, which -/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift -/// amount, or -/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift -/// amount. -SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - assert(Op.getOpcode() == ISD::SHL_PARTS); - - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - - if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
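The slow path of the left shift below decomposes the same way (the second line of the Amt>=size case in that comment is evidently meant to define dHi, not dLo). A host-side emulation for shift amounts under 64:

#include <cstdint>
#include <cstdio>

// {OutHi, OutLo} = {Hi, Lo} << Amt, using only 32-bit operations, Amt < 64.
static void shlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                     uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt == 0) { OutLo = Lo; OutHi = Hi; return; } // avoid UB on >> 32
  if (Amt >= 32) {           // "Amt >= size": the low word becomes zero and
    OutLo = 0;               // the high word receives Lo << (Amt - size).
    OutHi = Lo << (Amt - 32);
  } else {
    OutLo = Lo << Amt;
    OutHi = (Hi << Amt) | (Lo >> (32 - Amt));
  }
}

int main() {
  uint32_t Lo, Hi;
  shlParts(0x89ABCDEFu, 0x01234567u, 8, Lo, Hi);
  std::printf("0x%08X%08X\n", Hi, Lo); // 0x23456789ABCDEF00
}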
- // {dHi, dLo} = {aHi, aLo} << Amt - // dHi = shf.l.clamp aLo, aHi, Amt - // dLo = aLo << Amt - - SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, - ShAmt); - SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); - } - else { - // {dHi, dLo} = {aHi, aLo} << Amt - // - if (Amt>=size) then - // dLo = aLo << Amt (all 0) - // dLo = aLo << (Amt-size) - // else - // dLo = aLo << Amt - // dHi = (aHi << Amt) | (aLo >> (size-Amt)) - - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, - DAG.getConstant(VTBits, dl, MVT::i32), - ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, - DAG.getConstant(VTBits, dl, MVT::i32)); - SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - - SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, - DAG.getConstant(VTBits, dl, MVT::i32), - ISD::SETGE); - SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); - } -} - -SDValue -NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - case ISD::RETURNADDR: - return SDValue(); - case ISD::FRAMEADDR: - return SDValue(); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - case ISD::INTRINSIC_W_CHAIN: - return Op; - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: - return Op; - case ISD::EXTRACT_VECTOR_ELT: - return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::CONCAT_VECTORS: - return LowerCONCAT_VECTORS(Op, DAG); - case ISD::STORE: - return LowerSTORE(Op, DAG); - case ISD::LOAD: - return LowerLOAD(Op, DAG); - case ISD::SHL_PARTS: - return LowerShiftLeftParts(Op, DAG); - case ISD::SRA_PARTS: - case ISD::SRL_PARTS: - return LowerShiftRightParts(Op, DAG); - case ISD::SELECT: - return LowerSelect(Op, DAG); - default: - llvm_unreachable("Custom lowering not defined for operation"); - } -} - -SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { - SDValue Op0 = Op->getOperand(0); - SDValue Op1 = Op->getOperand(1); - SDValue Op2 = Op->getOperand(2); - SDLoc DL(Op.getNode()); - - assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); - - Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); - Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); - SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); - - return Trunc; -} - -SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::i1) - return LowerLOADi1(Op, DAG); - - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // loads and have to handle it here. 
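Conceptually, expanding an unaligned v2f16 load as done below amounts to reading the four bytes individually and reassembling them, instead of issuing a single ld.b32 that would have to be naturally aligned. A host-side picture of that idea (little-endian host assumed; this is not the exact DAG expansion emitted by expandUnalignedLoad):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Read a 32-bit value from an address with no alignment guarantee by copying
// bytes, rather than a word load that would require 4-byte alignment.
static uint32_t loadUnaligned32(const unsigned char *P) {
  uint32_t V = 0;
  std::memcpy(&V, P, sizeof(V));
  return V;
}

int main() {
  // 0x3C00 (1.0h) and 0x4000 (2.0h) placed at an odd, i.e. unaligned, offset.
  unsigned char Buf[8] = {0xAA, 0x00, 0x3C, 0x00, 0x40, 0, 0, 0};
  std::printf("0x%08X\n", loadUnaligned32(Buf + 1)); // 0x40003C00 on little endian
}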
- if (Op.getValueType() == MVT::v2f16) { - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - Load->getAddressSpace(), Load->getAlignment())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, SDLoc(Op)); - } - } - - return SDValue(); -} - -// v = ld i1* addr -// => -// v1 = ld i8* addr (-> i16) -// v = trunc i16 to i1 -SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - LoadSDNode *LD = cast(Node); - SDLoc dl(Node); - assert(LD->getExtensionType() == ISD::NON_EXTLOAD); - assert(Node->getValueType(0) == MVT::i1 && - "Custom lowering for i1 load only"); - SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->getAlignment(), - LD->getMemOperand()->getFlags()); - SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); - // The legalizer (the caller) is expecting two values from the legalized - // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() - // in LegalizeDAG.cpp which also uses MergeValues. - SDValue Ops[] = { result, LD->getChain() }; - return DAG.getMergeValues(Ops, dl); -} - -SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - StoreSDNode *Store = cast(Op); - EVT VT = Store->getMemoryVT(); - - if (VT == MVT::i1) - return LowerSTOREi1(Op, DAG); - - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // stores and have to handle it here. - if (VT == MVT::v2f16 && - !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - Store->getAddressSpace(), Store->getAlignment())) - return expandUnalignedStore(Store, DAG); - - if (VT.isVector()) - return LowerSTOREVector(Op, DAG); - - return SDValue(); -} - -SDValue -NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { - SDNode *N = Op.getNode(); - SDValue Val = N->getOperand(1); - SDLoc DL(N); - EVT ValVT = Val.getValueType(); - - if (ValVT.isVector()) { - // We only handle "native" vector sizes for now, e.g. <4 x double> is not - // legal. We can (and should) split that into 2 stores of <2 x double> here - // but I'm leaving that as a TODO for now. - if (!ValVT.isSimple()) - return SDValue(); - switch (ValVT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v2i8: - case MVT::v2i16: - case MVT::v2i32: - case MVT::v2i64: - case MVT::v2f16: - case MVT::v2f32: - case MVT::v2f64: - case MVT::v4i8: - case MVT::v4i16: - case MVT::v4i32: - case MVT::v4f16: - case MVT::v4f32: - case MVT::v8f16: // <4 x f16x2> - // This is a "native" vector type - break; - } - - MemSDNode *MemSD = cast(N); - const DataLayout &TD = DAG.getDataLayout(); - - unsigned Align = MemSD->getAlignment(); - unsigned PrefAlign = - TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); - if (Align < PrefAlign) { - // This store is not sufficiently aligned, so bail out and let this vector - // store be scalarized. Note that we may still be able to emit smaller - // vector stores. For example, if we are storing a <4 x float> with an - // alignment of 8, this check will fail but the legalizer will try again - // with 2 x <2 x float>, which will succeed with an alignment of 8. 
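The bail-out above is a plain comparison of the store's actual alignment against the preferred alignment of the vector type; the comment's <4 x float> example plays out like this (the 16-byte preference for <4 x float> is assumed here for illustration):

#include <cstdio>

// Keep the vector store only if the actual alignment meets the preferred one;
// otherwise bail out and let the legalizer retry with narrower vectors.
static bool canUseVectorStore(unsigned ActualAlign, unsigned PrefAlign) {
  return ActualAlign >= PrefAlign;
}

int main() {
  std::printf("<4 x float>, align 8: %s\n",
              canUseVectorStore(8, 16) ? "keep vector store" : "split/scalarize");
  std::printf("<2 x float>, align 8: %s\n",
              canUseVectorStore(8, 8) ? "keep vector store" : "split/scalarize");
}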
- return SDValue(); - } - - unsigned Opcode = 0; - EVT EltVT = ValVT.getVectorElementType(); - unsigned NumElts = ValVT.getVectorNumElements(); - - // Since StoreV2 is a target node, we cannot rely on DAG type legalization. - // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // stored type to i16 and propagate the "real" type as the memory type. - bool NeedExt = false; - if (EltVT.getSizeInBits() < 16) - NeedExt = true; - - bool StoreF16x2 = false; - switch (NumElts) { - default: - return SDValue(); - case 2: - Opcode = NVPTXISD::StoreV2; - break; - case 4: - Opcode = NVPTXISD::StoreV4; - break; - case 8: - // v8f16 is a special case. PTX doesn't have st.v8.f16 - // instruction. Instead, we split the vector into v2f16 chunks and - // store them with st.v4.b32. - assert(EltVT == MVT::f16 && "Wrong type for the vector."); - Opcode = NVPTXISD::StoreV4; - StoreF16x2 = true; - break; - } - - SmallVector Ops; - - // First is the chain - Ops.push_back(N->getOperand(0)); - - if (StoreF16x2) { - // Combine f16,f16 -> v2f16 - NumElts /= 2; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, - DAG.getIntPtrConstant(i * 2, DL)); - SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, - DAG.getIntPtrConstant(i * 2 + 1, DL)); - SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); - Ops.push_back(V2); - } - } else { - // Then the split values - for (unsigned i = 0; i < NumElts; ++i) { - SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, - DAG.getIntPtrConstant(i, DL)); - if (NeedExt) - ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); - Ops.push_back(ExtVal); - } - } - - // Then any remaining arguments - Ops.append(N->op_begin() + 2, N->op_end()); - - SDValue NewSt = - DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, - MemSD->getMemoryVT(), MemSD->getMemOperand()); - - // return DCI.CombineTo(N, NewSt, true); - return NewSt; - } - - return SDValue(); -} - -// st i1 v, addr -// => -// v1 = zxt v to i16 -// st.u8 i16, addr -SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); - StoreSDNode *ST = cast(Node); - SDValue Tmp1 = ST->getChain(); - SDValue Tmp2 = ST->getBasePtr(); - SDValue Tmp3 = ST->getValue(); - assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); - Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); - SDValue Result = - DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, - ST->getAlignment(), ST->getMemOperand()->getFlags()); - return Result; -} - -SDValue -NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { - std::string ParamSym; - raw_string_ostream ParamStr(ParamSym); - - ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; - ParamStr.flush(); - - std::string *SavedStr = - nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); - return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); -} - -// Check to see if the kernel argument is image*_t or sampler_t - -static bool isImageOrSamplerVal(const Value *arg, const Module *context) { - static const char *const specialTypes[] = { "struct._image2d_t", - "struct._image3d_t", - "struct._sampler_t" }; - - Type *Ty = arg->getType(); - auto *PTy = dyn_cast(Ty); - - if (!PTy) - return false; - - if (!context) - return false; - - auto *STy = dyn_cast(PTy->getElementType()); - if (!STy || STy->isLiteral()) - 
return false; - - return std::find(std::begin(specialTypes), std::end(specialTypes), - STy->getName()) != std::end(specialTypes); -} - -SDValue NVPTXTargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - const DataLayout &DL = DAG.getDataLayout(); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - const Function *F = MF.getFunction(); - const AttributeList &PAL = F->getAttributes(); - const TargetLowering *TLI = STI.getTargetLowering(); - - SDValue Root = DAG.getRoot(); - std::vector OutChains; - - bool isABI = (STI.getSmVersion() >= 20); - assert(isABI && "Non-ABI compilation is not supported"); - if (!isABI) - return Chain; - - std::vector argTypes; - std::vector theArgs; - for (const Argument &I : F->args()) { - theArgs.push_back(&I); - argTypes.push_back(I.getType()); - } - // argTypes.size() (or theArgs.size()) and Ins.size() need not match. - // Ins.size() will be larger - // * if there is an aggregate argument with multiple fields (each field - // showing up separately in Ins) - // * if there is a vector argument with more than typical vector-length - // elements (generally if more than 4) where each vector element is - // individually present in Ins. - // So a different index should be used for indexing into Ins. - // See similar issue in LowerCall. - unsigned InsIdx = 0; - - int idx = 0; - for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { - Type *Ty = argTypes[i]; - - // If the kernel argument is image*_t or sampler_t, convert it to - // a i32 constant holding the parameter position. This can later - // matched in the AsmPrinter to output the correct mangled name. - if (isImageOrSamplerVal( - theArgs[i], - (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() - : nullptr))) { - assert(isKernelFunction(*F) && - "Only kernels can have image/sampler params"); - InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); - continue; - } - - if (theArgs[i]->use_empty()) { - // argument is dead - if (Ty->isAggregateType()) { - SmallVector vtparts; - - ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); - assert(vtparts.size() > 0 && "empty aggregate type not expected"); - for (unsigned parti = 0, parte = vtparts.size(); parti != parte; - ++parti) { - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); - ++InsIdx; - } - if (vtparts.size() > 0) - --InsIdx; - continue; - } - if (Ty->isVectorTy()) { - EVT ObjectVT = getValueType(DL, Ty); - unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); - for (unsigned parti = 0; parti < NumRegs; ++parti) { - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); - ++InsIdx; - } - if (NumRegs > 0) - --InsIdx; - continue; - } - InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); - continue; - } - - // In the following cases, assign a node order of "idx+1" - // to newly created nodes. The SDNodes for params have to - // appear in the same order as their order of appearance - // in the original function. "idx+1" holds that order. 
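One detail of the argument loads that follow: a v2f16 piece is loaded as a plain i32 and then bitcast back, because getLoad cannot produce vectors whose elements are themselves v2f16. On a little-endian host the equivalent unpacking of such an i32 looks like this (same illustrative half values as above):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint32_t Loaded = 0x40003C00; // the two half bit patterns, element 0 in the low half
  uint16_t Elt0, Elt1;
  std::memcpy(&Elt0, reinterpret_cast<const unsigned char *>(&Loaded) + 0, 2);
  std::memcpy(&Elt1, reinterpret_cast<const unsigned char *>(&Loaded) + 2, 2);
  std::printf("elt0 = 0x%04X, elt1 = 0x%04X\n", (unsigned)Elt0, (unsigned)Elt1);
  // prints "elt0 = 0x3C00, elt1 = 0x4000"
}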
- if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { - bool aggregateIsPacked = false; - if (StructType *STy = dyn_cast(Ty)) - aggregateIsPacked = STy->isPacked(); - - SmallVector VTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); - assert(VTs.size() > 0 && "Unexpected empty type."); - auto VectorInfo = - VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); - - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - int VecIdx = -1; // Index of the first element of the current vector. - for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { - if (VectorInfo[parti] & PVF_FIRST) { - assert(VecIdx == -1 && "Orphaned vector."); - VecIdx = parti; - } - - // That's the last element of this store op. - if (VectorInfo[parti] & PVF_LAST) { - unsigned NumElts = parti - VecIdx + 1; - EVT EltVT = VTs[parti]; - // i1 is loaded/stored as i8. - EVT LoadVT = EltVT; - if (EltVT == MVT::i1) - LoadVT = MVT::i8; - else if (EltVT == MVT::v2f16) - // getLoad needs a vector type, but it can't handle - // vectors which contain v2f16 elements. So we must load - // using i32 here and then bitcast back. - LoadVT = MVT::i32; - - EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); - SDValue VecAddr = - DAG.getNode(ISD::ADD, dl, PtrVT, Arg, - DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); - Value *srcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); - SDValue P = - DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), aggregateIsPacked, - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); - if (P.getNode()) - P.getNode()->setIROrder(idx + 1); - for (unsigned j = 0; j < NumElts; ++j) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, - DAG.getIntPtrConstant(j, dl)); - // We've loaded i1 as an i8 and now must truncate it back to i1 - if (EltVT == MVT::i1) - Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); - // v2f16 was loaded as an i32. Now we must bitcast it back. - else if (EltVT == MVT::v2f16) - Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); - // Extend the element if necesary (e.g. an i8 is loaded - // into an i16 register) - if (Ins[InsIdx].VT.isInteger() && - Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { - unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; - Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); - } - InVals.push_back(Elt); - } - - // Reset vector tracking state. - VecIdx = -1; - } - ++InsIdx; - } - if (VTs.size() > 0) - --InsIdx; - continue; - } - - // Param has ByVal attribute - // Return MoveParam(param symbol). - // Ideally, the param symbol can be returned directly, - // but when SDNode builder decides to use it in a CopyToReg(), - // machine instruction fails because TargetExternalSymbol - // (not lowered) is target dependent, and CopyToReg assumes - // the source is lowered. - EVT ObjectVT = getValueType(DL, Ty); - assert(ObjectVT == Ins[InsIdx].VT && - "Ins type did not match function type"); - SDValue Arg = getParamSymbol(DAG, idx, PtrVT); - SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); - if (p.getNode()) - p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); - } - - // Clang will check explicit VarArg and issue error if any. However, Clang - // will let code with - // implicit var arg like f() pass. See bug 617733. - // We treat this case as if the arg list is empty. 
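The return lowering further below uses the same PVF_FIRST/PVF_LAST grouping as the parameter stores above: elements between a FIRST and the matching LAST flag are flushed as one StoreRetval, StoreRetvalV2 or StoreRetvalV4 node. A much-simplified consumer of such flags, with hypothetical flag values and grouping, just to show the control flow:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the real flags; only their FIRST/LAST meaning
// matters for this sketch.
enum { PVF_FIRST = 1, PVF_LAST = 2 };

int main() {
  // Six return-value pieces, grouped as [0..3] and [4..5] for illustration.
  std::vector<unsigned> Flags = {PVF_FIRST, 0, 0, PVF_LAST, PVF_FIRST, PVF_LAST};
  unsigned Count = 0;
  for (unsigned I = 0; I != Flags.size(); ++I) {
    ++Count; // element I joins the store currently being assembled
    if (Flags[I] & PVF_LAST) {
      const char *Op = Count == 1 ? "StoreRetval"
                     : Count == 2 ? "StoreRetvalV2"
                                  : "StoreRetvalV4";
      std::printf("%s covering %u element(s), ending at index %u\n", Op, Count, I);
      Count = 0; // start assembling the next store
    }
  }
}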
- // if (F.isVarArg()) { - // assert(0 && "VarArg not supported yet!"); - //} - - if (!OutChains.empty()) - DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); - - return Chain; -} - -SDValue -NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SDLoc &dl, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - Type *RetTy = MF.getFunction()->getReturnType(); - - bool isABI = (STI.getSmVersion() >= 20); - assert(isABI && "Non-ABI compilation is not supported"); - if (!isABI) - return Chain; - - const DataLayout DL = DAG.getDataLayout(); - SmallVector VTs; - SmallVector Offsets; - ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); - assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); - - auto VectorInfo = VectorizePTXValueVTs( - VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); - - // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than - // 32-bits are sign extended or zero extended, depending on whether - // they are signed or unsigned types. - bool ExtendIntegerRetVal = - RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; - - SmallVector StoreOperands; - for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - - SDValue RetVal = OutVals[i]; - if (ExtendIntegerRetVal) { - RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, - dl, MVT::i32, RetVal); - } else if (RetVal.getValueSizeInBits() < 16) { - // Use 16-bit registers for small load-stores as it's the - // smallest general purpose register size supported by NVPTX. - RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); - } - - // Record the value to return. - StoreOperands.push_back(RetVal); - - // That's the last element of this store op. - if (VectorInfo[i] & PVF_LAST) { - NVPTXISD::NodeType Op; - unsigned NumElts = StoreOperands.size() - 2; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreRetval; - break; - case 2: - Op = NVPTXISD::StoreRetvalV2; - break; - case 4: - Op = NVPTXISD::StoreRetvalV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } - - // Adjust type of load/store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; - Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), - StoreOperands, TheStoreType, - MachinePointerInfo(), 1); - // Cleanup vector state. 
- StoreOperands.clear(); - } - } - - return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); -} - -void NVPTXTargetLowering::LowerAsmOperandForConstraint( - SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const { - if (Constraint.length() > 1) - return; - else - TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); -} - -static unsigned getOpcForTextureInstr(unsigned Intrinsic) { - switch (Intrinsic) { - default: - return 0; - - case Intrinsic::nvvm_tex_1d_v4f32_s32: - return NVPTXISD::Tex1DFloatS32; - case Intrinsic::nvvm_tex_1d_v4f32_f32: - return NVPTXISD::Tex1DFloatFloat; - case Intrinsic::nvvm_tex_1d_level_v4f32_f32: - return NVPTXISD::Tex1DFloatFloatLevel; - case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: - return NVPTXISD::Tex1DFloatFloatGrad; - case Intrinsic::nvvm_tex_1d_v4s32_s32: - return NVPTXISD::Tex1DS32S32; - case Intrinsic::nvvm_tex_1d_v4s32_f32: - return NVPTXISD::Tex1DS32Float; - case Intrinsic::nvvm_tex_1d_level_v4s32_f32: - return NVPTXISD::Tex1DS32FloatLevel; - case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: - return NVPTXISD::Tex1DS32FloatGrad; - case Intrinsic::nvvm_tex_1d_v4u32_s32: - return NVPTXISD::Tex1DU32S32; - case Intrinsic::nvvm_tex_1d_v4u32_f32: - return NVPTXISD::Tex1DU32Float; - case Intrinsic::nvvm_tex_1d_level_v4u32_f32: - return NVPTXISD::Tex1DU32FloatLevel; - case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: - return NVPTXISD::Tex1DU32FloatGrad; - - case Intrinsic::nvvm_tex_1d_array_v4f32_s32: - return NVPTXISD::Tex1DArrayFloatS32; - case Intrinsic::nvvm_tex_1d_array_v4f32_f32: - return NVPTXISD::Tex1DArrayFloatFloat; - case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: - return NVPTXISD::Tex1DArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: - return NVPTXISD::Tex1DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_1d_array_v4s32_s32: - return NVPTXISD::Tex1DArrayS32S32; - case Intrinsic::nvvm_tex_1d_array_v4s32_f32: - return NVPTXISD::Tex1DArrayS32Float; - case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: - return NVPTXISD::Tex1DArrayS32FloatLevel; - case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: - return NVPTXISD::Tex1DArrayS32FloatGrad; - case Intrinsic::nvvm_tex_1d_array_v4u32_s32: - return NVPTXISD::Tex1DArrayU32S32; - case Intrinsic::nvvm_tex_1d_array_v4u32_f32: - return NVPTXISD::Tex1DArrayU32Float; - case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: - return NVPTXISD::Tex1DArrayU32FloatLevel; - case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: - return NVPTXISD::Tex1DArrayU32FloatGrad; - - case Intrinsic::nvvm_tex_2d_v4f32_s32: - return NVPTXISD::Tex2DFloatS32; - case Intrinsic::nvvm_tex_2d_v4f32_f32: - return NVPTXISD::Tex2DFloatFloat; - case Intrinsic::nvvm_tex_2d_level_v4f32_f32: - return NVPTXISD::Tex2DFloatFloatLevel; - case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: - return NVPTXISD::Tex2DFloatFloatGrad; - case Intrinsic::nvvm_tex_2d_v4s32_s32: - return NVPTXISD::Tex2DS32S32; - case Intrinsic::nvvm_tex_2d_v4s32_f32: - return NVPTXISD::Tex2DS32Float; - case Intrinsic::nvvm_tex_2d_level_v4s32_f32: - return NVPTXISD::Tex2DS32FloatLevel; - case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: - return NVPTXISD::Tex2DS32FloatGrad; - case Intrinsic::nvvm_tex_2d_v4u32_s32: - return NVPTXISD::Tex2DU32S32; - case Intrinsic::nvvm_tex_2d_v4u32_f32: - return NVPTXISD::Tex2DU32Float; - case Intrinsic::nvvm_tex_2d_level_v4u32_f32: - return NVPTXISD::Tex2DU32FloatLevel; - case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: - return NVPTXISD::Tex2DU32FloatGrad; - - case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 
- return NVPTXISD::Tex2DArrayFloatS32; - case Intrinsic::nvvm_tex_2d_array_v4f32_f32: - return NVPTXISD::Tex2DArrayFloatFloat; - case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: - return NVPTXISD::Tex2DArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: - return NVPTXISD::Tex2DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_2d_array_v4s32_s32: - return NVPTXISD::Tex2DArrayS32S32; - case Intrinsic::nvvm_tex_2d_array_v4s32_f32: - return NVPTXISD::Tex2DArrayS32Float; - case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: - return NVPTXISD::Tex2DArrayS32FloatLevel; - case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: - return NVPTXISD::Tex2DArrayS32FloatGrad; - case Intrinsic::nvvm_tex_2d_array_v4u32_s32: - return NVPTXISD::Tex2DArrayU32S32; - case Intrinsic::nvvm_tex_2d_array_v4u32_f32: - return NVPTXISD::Tex2DArrayU32Float; - case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: - return NVPTXISD::Tex2DArrayU32FloatLevel; - case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: - return NVPTXISD::Tex2DArrayU32FloatGrad; - - case Intrinsic::nvvm_tex_3d_v4f32_s32: - return NVPTXISD::Tex3DFloatS32; - case Intrinsic::nvvm_tex_3d_v4f32_f32: - return NVPTXISD::Tex3DFloatFloat; - case Intrinsic::nvvm_tex_3d_level_v4f32_f32: - return NVPTXISD::Tex3DFloatFloatLevel; - case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: - return NVPTXISD::Tex3DFloatFloatGrad; - case Intrinsic::nvvm_tex_3d_v4s32_s32: - return NVPTXISD::Tex3DS32S32; - case Intrinsic::nvvm_tex_3d_v4s32_f32: - return NVPTXISD::Tex3DS32Float; - case Intrinsic::nvvm_tex_3d_level_v4s32_f32: - return NVPTXISD::Tex3DS32FloatLevel; - case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: - return NVPTXISD::Tex3DS32FloatGrad; - case Intrinsic::nvvm_tex_3d_v4u32_s32: - return NVPTXISD::Tex3DU32S32; - case Intrinsic::nvvm_tex_3d_v4u32_f32: - return NVPTXISD::Tex3DU32Float; - case Intrinsic::nvvm_tex_3d_level_v4u32_f32: - return NVPTXISD::Tex3DU32FloatLevel; - case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: - return NVPTXISD::Tex3DU32FloatGrad; - - case Intrinsic::nvvm_tex_cube_v4f32_f32: - return NVPTXISD::TexCubeFloatFloat; - case Intrinsic::nvvm_tex_cube_level_v4f32_f32: - return NVPTXISD::TexCubeFloatFloatLevel; - case Intrinsic::nvvm_tex_cube_v4s32_f32: - return NVPTXISD::TexCubeS32Float; - case Intrinsic::nvvm_tex_cube_level_v4s32_f32: - return NVPTXISD::TexCubeS32FloatLevel; - case Intrinsic::nvvm_tex_cube_v4u32_f32: - return NVPTXISD::TexCubeU32Float; - case Intrinsic::nvvm_tex_cube_level_v4u32_f32: - return NVPTXISD::TexCubeU32FloatLevel; - - case Intrinsic::nvvm_tex_cube_array_v4f32_f32: - return NVPTXISD::TexCubeArrayFloatFloat; - case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: - return NVPTXISD::TexCubeArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_cube_array_v4s32_f32: - return NVPTXISD::TexCubeArrayS32Float; - case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: - return NVPTXISD::TexCubeArrayS32FloatLevel; - case Intrinsic::nvvm_tex_cube_array_v4u32_f32: - return NVPTXISD::TexCubeArrayU32Float; - case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: - return NVPTXISD::TexCubeArrayU32FloatLevel; - - case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: - return NVPTXISD::Tld4R2DFloatFloat; - case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: - return NVPTXISD::Tld4G2DFloatFloat; - case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: - return NVPTXISD::Tld4B2DFloatFloat; - case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: - return NVPTXISD::Tld4A2DFloatFloat; - case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: - return NVPTXISD::Tld4R2DS64Float; - case 
Intrinsic::nvvm_tld4_g_2d_v4s32_f32: - return NVPTXISD::Tld4G2DS64Float; - case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: - return NVPTXISD::Tld4B2DS64Float; - case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: - return NVPTXISD::Tld4A2DS64Float; - case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: - return NVPTXISD::Tld4R2DU64Float; - case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: - return NVPTXISD::Tld4G2DU64Float; - case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: - return NVPTXISD::Tld4B2DU64Float; - case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: - return NVPTXISD::Tld4A2DU64Float; - - case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: - return NVPTXISD::TexUnified1DFloatS32; - case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: - return NVPTXISD::TexUnified1DFloatFloat; - case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: - return NVPTXISD::TexUnified1DFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: - return NVPTXISD::TexUnified1DFloatFloatGrad; - case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: - return NVPTXISD::TexUnified1DS32S32; - case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: - return NVPTXISD::TexUnified1DS32Float; - case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: - return NVPTXISD::TexUnified1DS32FloatLevel; - case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: - return NVPTXISD::TexUnified1DS32FloatGrad; - case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: - return NVPTXISD::TexUnified1DU32S32; - case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: - return NVPTXISD::TexUnified1DU32Float; - case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: - return NVPTXISD::TexUnified1DU32FloatLevel; - case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: - return NVPTXISD::TexUnified1DU32FloatGrad; - - case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: - return NVPTXISD::TexUnified1DArrayFloatS32; - case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: - return NVPTXISD::TexUnified1DArrayFloatFloat; - case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: - return NVPTXISD::TexUnified1DArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: - return NVPTXISD::TexUnified1DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: - return NVPTXISD::TexUnified1DArrayS32S32; - case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: - return NVPTXISD::TexUnified1DArrayS32Float; - case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: - return NVPTXISD::TexUnified1DArrayS32FloatLevel; - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: - return NVPTXISD::TexUnified1DArrayS32FloatGrad; - case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: - return NVPTXISD::TexUnified1DArrayU32S32; - case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: - return NVPTXISD::TexUnified1DArrayU32Float; - case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: - return NVPTXISD::TexUnified1DArrayU32FloatLevel; - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: - return NVPTXISD::TexUnified1DArrayU32FloatGrad; - - case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: - return NVPTXISD::TexUnified2DFloatS32; - case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: - return NVPTXISD::TexUnified2DFloatFloat; - case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: - return NVPTXISD::TexUnified2DFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: - return NVPTXISD::TexUnified2DFloatFloatGrad; - case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: - return NVPTXISD::TexUnified2DS32S32; - case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: - return NVPTXISD::TexUnified2DS32Float; - case 
Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: - return NVPTXISD::TexUnified2DS32FloatLevel; - case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: - return NVPTXISD::TexUnified2DS32FloatGrad; - case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: - return NVPTXISD::TexUnified2DU32S32; - case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: - return NVPTXISD::TexUnified2DU32Float; - case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: - return NVPTXISD::TexUnified2DU32FloatLevel; - case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: - return NVPTXISD::TexUnified2DU32FloatGrad; - - case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: - return NVPTXISD::TexUnified2DArrayFloatS32; - case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: - return NVPTXISD::TexUnified2DArrayFloatFloat; - case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: - return NVPTXISD::TexUnified2DArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: - return NVPTXISD::TexUnified2DArrayFloatFloatGrad; - case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: - return NVPTXISD::TexUnified2DArrayS32S32; - case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: - return NVPTXISD::TexUnified2DArrayS32Float; - case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: - return NVPTXISD::TexUnified2DArrayS32FloatLevel; - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: - return NVPTXISD::TexUnified2DArrayS32FloatGrad; - case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: - return NVPTXISD::TexUnified2DArrayU32S32; - case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: - return NVPTXISD::TexUnified2DArrayU32Float; - case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: - return NVPTXISD::TexUnified2DArrayU32FloatLevel; - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: - return NVPTXISD::TexUnified2DArrayU32FloatGrad; - - case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: - return NVPTXISD::TexUnified3DFloatS32; - case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: - return NVPTXISD::TexUnified3DFloatFloat; - case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: - return NVPTXISD::TexUnified3DFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: - return NVPTXISD::TexUnified3DFloatFloatGrad; - case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: - return NVPTXISD::TexUnified3DS32S32; - case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: - return NVPTXISD::TexUnified3DS32Float; - case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: - return NVPTXISD::TexUnified3DS32FloatLevel; - case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: - return NVPTXISD::TexUnified3DS32FloatGrad; - case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: - return NVPTXISD::TexUnified3DU32S32; - case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: - return NVPTXISD::TexUnified3DU32Float; - case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: - return NVPTXISD::TexUnified3DU32FloatLevel; - case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: - return NVPTXISD::TexUnified3DU32FloatGrad; - - case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: - return NVPTXISD::TexUnifiedCubeFloatFloat; - case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: - return NVPTXISD::TexUnifiedCubeFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: - return NVPTXISD::TexUnifiedCubeS32Float; - case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: - return NVPTXISD::TexUnifiedCubeS32FloatLevel; - case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: - return NVPTXISD::TexUnifiedCubeU32Float; - case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: - return 
NVPTXISD::TexUnifiedCubeU32FloatLevel; - - case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: - return NVPTXISD::TexUnifiedCubeArrayFloatFloat; - case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: - return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; - case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: - return NVPTXISD::TexUnifiedCubeArrayS32Float; - case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: - return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; - case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: - return NVPTXISD::TexUnifiedCubeArrayU32Float; - case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: - return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; - - case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: - return NVPTXISD::Tld4UnifiedR2DFloatFloat; - case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: - return NVPTXISD::Tld4UnifiedG2DFloatFloat; - case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: - return NVPTXISD::Tld4UnifiedB2DFloatFloat; - case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: - return NVPTXISD::Tld4UnifiedA2DFloatFloat; - case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: - return NVPTXISD::Tld4UnifiedR2DS64Float; - case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: - return NVPTXISD::Tld4UnifiedG2DS64Float; - case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: - return NVPTXISD::Tld4UnifiedB2DS64Float; - case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: - return NVPTXISD::Tld4UnifiedA2DS64Float; - case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: - return NVPTXISD::Tld4UnifiedR2DU64Float; - case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: - return NVPTXISD::Tld4UnifiedG2DU64Float; - case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: - return NVPTXISD::Tld4UnifiedB2DU64Float; - case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: - return NVPTXISD::Tld4UnifiedA2DU64Float; - } -} - -static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { - switch (Intrinsic) { - default: - return 0; - case Intrinsic::nvvm_suld_1d_i8_clamp: - return NVPTXISD::Suld1DI8Clamp; - case Intrinsic::nvvm_suld_1d_i16_clamp: - return NVPTXISD::Suld1DI16Clamp; - case Intrinsic::nvvm_suld_1d_i32_clamp: - return NVPTXISD::Suld1DI32Clamp; - case Intrinsic::nvvm_suld_1d_i64_clamp: - return NVPTXISD::Suld1DI64Clamp; - case Intrinsic::nvvm_suld_1d_v2i8_clamp: - return NVPTXISD::Suld1DV2I8Clamp; - case Intrinsic::nvvm_suld_1d_v2i16_clamp: - return NVPTXISD::Suld1DV2I16Clamp; - case Intrinsic::nvvm_suld_1d_v2i32_clamp: - return NVPTXISD::Suld1DV2I32Clamp; - case Intrinsic::nvvm_suld_1d_v2i64_clamp: - return NVPTXISD::Suld1DV2I64Clamp; - case Intrinsic::nvvm_suld_1d_v4i8_clamp: - return NVPTXISD::Suld1DV4I8Clamp; - case Intrinsic::nvvm_suld_1d_v4i16_clamp: - return NVPTXISD::Suld1DV4I16Clamp; - case Intrinsic::nvvm_suld_1d_v4i32_clamp: - return NVPTXISD::Suld1DV4I32Clamp; - case Intrinsic::nvvm_suld_1d_array_i8_clamp: - return NVPTXISD::Suld1DArrayI8Clamp; - case Intrinsic::nvvm_suld_1d_array_i16_clamp: - return NVPTXISD::Suld1DArrayI16Clamp; - case Intrinsic::nvvm_suld_1d_array_i32_clamp: - return NVPTXISD::Suld1DArrayI32Clamp; - case Intrinsic::nvvm_suld_1d_array_i64_clamp: - return NVPTXISD::Suld1DArrayI64Clamp; - case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: - return NVPTXISD::Suld1DArrayV2I8Clamp; - case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: - return NVPTXISD::Suld1DArrayV2I16Clamp; - case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: - return NVPTXISD::Suld1DArrayV2I32Clamp; - case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: - return 
NVPTXISD::Suld1DArrayV2I64Clamp; - case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: - return NVPTXISD::Suld1DArrayV4I8Clamp; - case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: - return NVPTXISD::Suld1DArrayV4I16Clamp; - case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: - return NVPTXISD::Suld1DArrayV4I32Clamp; - case Intrinsic::nvvm_suld_2d_i8_clamp: - return NVPTXISD::Suld2DI8Clamp; - case Intrinsic::nvvm_suld_2d_i16_clamp: - return NVPTXISD::Suld2DI16Clamp; - case Intrinsic::nvvm_suld_2d_i32_clamp: - return NVPTXISD::Suld2DI32Clamp; - case Intrinsic::nvvm_suld_2d_i64_clamp: - return NVPTXISD::Suld2DI64Clamp; - case Intrinsic::nvvm_suld_2d_v2i8_clamp: - return NVPTXISD::Suld2DV2I8Clamp; - case Intrinsic::nvvm_suld_2d_v2i16_clamp: - return NVPTXISD::Suld2DV2I16Clamp; - case Intrinsic::nvvm_suld_2d_v2i32_clamp: - return NVPTXISD::Suld2DV2I32Clamp; - case Intrinsic::nvvm_suld_2d_v2i64_clamp: - return NVPTXISD::Suld2DV2I64Clamp; - case Intrinsic::nvvm_suld_2d_v4i8_clamp: - return NVPTXISD::Suld2DV4I8Clamp; - case Intrinsic::nvvm_suld_2d_v4i16_clamp: - return NVPTXISD::Suld2DV4I16Clamp; - case Intrinsic::nvvm_suld_2d_v4i32_clamp: - return NVPTXISD::Suld2DV4I32Clamp; - case Intrinsic::nvvm_suld_2d_array_i8_clamp: - return NVPTXISD::Suld2DArrayI8Clamp; - case Intrinsic::nvvm_suld_2d_array_i16_clamp: - return NVPTXISD::Suld2DArrayI16Clamp; - case Intrinsic::nvvm_suld_2d_array_i32_clamp: - return NVPTXISD::Suld2DArrayI32Clamp; - case Intrinsic::nvvm_suld_2d_array_i64_clamp: - return NVPTXISD::Suld2DArrayI64Clamp; - case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: - return NVPTXISD::Suld2DArrayV2I8Clamp; - case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: - return NVPTXISD::Suld2DArrayV2I16Clamp; - case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: - return NVPTXISD::Suld2DArrayV2I32Clamp; - case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: - return NVPTXISD::Suld2DArrayV2I64Clamp; - case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: - return NVPTXISD::Suld2DArrayV4I8Clamp; - case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: - return NVPTXISD::Suld2DArrayV4I16Clamp; - case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: - return NVPTXISD::Suld2DArrayV4I32Clamp; - case Intrinsic::nvvm_suld_3d_i8_clamp: - return NVPTXISD::Suld3DI8Clamp; - case Intrinsic::nvvm_suld_3d_i16_clamp: - return NVPTXISD::Suld3DI16Clamp; - case Intrinsic::nvvm_suld_3d_i32_clamp: - return NVPTXISD::Suld3DI32Clamp; - case Intrinsic::nvvm_suld_3d_i64_clamp: - return NVPTXISD::Suld3DI64Clamp; - case Intrinsic::nvvm_suld_3d_v2i8_clamp: - return NVPTXISD::Suld3DV2I8Clamp; - case Intrinsic::nvvm_suld_3d_v2i16_clamp: - return NVPTXISD::Suld3DV2I16Clamp; - case Intrinsic::nvvm_suld_3d_v2i32_clamp: - return NVPTXISD::Suld3DV2I32Clamp; - case Intrinsic::nvvm_suld_3d_v2i64_clamp: - return NVPTXISD::Suld3DV2I64Clamp; - case Intrinsic::nvvm_suld_3d_v4i8_clamp: - return NVPTXISD::Suld3DV4I8Clamp; - case Intrinsic::nvvm_suld_3d_v4i16_clamp: - return NVPTXISD::Suld3DV4I16Clamp; - case Intrinsic::nvvm_suld_3d_v4i32_clamp: - return NVPTXISD::Suld3DV4I32Clamp; - case Intrinsic::nvvm_suld_1d_i8_trap: - return NVPTXISD::Suld1DI8Trap; - case Intrinsic::nvvm_suld_1d_i16_trap: - return NVPTXISD::Suld1DI16Trap; - case Intrinsic::nvvm_suld_1d_i32_trap: - return NVPTXISD::Suld1DI32Trap; - case Intrinsic::nvvm_suld_1d_i64_trap: - return NVPTXISD::Suld1DI64Trap; - case Intrinsic::nvvm_suld_1d_v2i8_trap: - return NVPTXISD::Suld1DV2I8Trap; - case Intrinsic::nvvm_suld_1d_v2i16_trap: - return NVPTXISD::Suld1DV2I16Trap; - case Intrinsic::nvvm_suld_1d_v2i32_trap: - return 
NVPTXISD::Suld1DV2I32Trap; - case Intrinsic::nvvm_suld_1d_v2i64_trap: - return NVPTXISD::Suld1DV2I64Trap; - case Intrinsic::nvvm_suld_1d_v4i8_trap: - return NVPTXISD::Suld1DV4I8Trap; - case Intrinsic::nvvm_suld_1d_v4i16_trap: - return NVPTXISD::Suld1DV4I16Trap; - case Intrinsic::nvvm_suld_1d_v4i32_trap: - return NVPTXISD::Suld1DV4I32Trap; - case Intrinsic::nvvm_suld_1d_array_i8_trap: - return NVPTXISD::Suld1DArrayI8Trap; - case Intrinsic::nvvm_suld_1d_array_i16_trap: - return NVPTXISD::Suld1DArrayI16Trap; - case Intrinsic::nvvm_suld_1d_array_i32_trap: - return NVPTXISD::Suld1DArrayI32Trap; - case Intrinsic::nvvm_suld_1d_array_i64_trap: - return NVPTXISD::Suld1DArrayI64Trap; - case Intrinsic::nvvm_suld_1d_array_v2i8_trap: - return NVPTXISD::Suld1DArrayV2I8Trap; - case Intrinsic::nvvm_suld_1d_array_v2i16_trap: - return NVPTXISD::Suld1DArrayV2I16Trap; - case Intrinsic::nvvm_suld_1d_array_v2i32_trap: - return NVPTXISD::Suld1DArrayV2I32Trap; - case Intrinsic::nvvm_suld_1d_array_v2i64_trap: - return NVPTXISD::Suld1DArrayV2I64Trap; - case Intrinsic::nvvm_suld_1d_array_v4i8_trap: - return NVPTXISD::Suld1DArrayV4I8Trap; - case Intrinsic::nvvm_suld_1d_array_v4i16_trap: - return NVPTXISD::Suld1DArrayV4I16Trap; - case Intrinsic::nvvm_suld_1d_array_v4i32_trap: - return NVPTXISD::Suld1DArrayV4I32Trap; - case Intrinsic::nvvm_suld_2d_i8_trap: - return NVPTXISD::Suld2DI8Trap; - case Intrinsic::nvvm_suld_2d_i16_trap: - return NVPTXISD::Suld2DI16Trap; - case Intrinsic::nvvm_suld_2d_i32_trap: - return NVPTXISD::Suld2DI32Trap; - case Intrinsic::nvvm_suld_2d_i64_trap: - return NVPTXISD::Suld2DI64Trap; - case Intrinsic::nvvm_suld_2d_v2i8_trap: - return NVPTXISD::Suld2DV2I8Trap; - case Intrinsic::nvvm_suld_2d_v2i16_trap: - return NVPTXISD::Suld2DV2I16Trap; - case Intrinsic::nvvm_suld_2d_v2i32_trap: - return NVPTXISD::Suld2DV2I32Trap; - case Intrinsic::nvvm_suld_2d_v2i64_trap: - return NVPTXISD::Suld2DV2I64Trap; - case Intrinsic::nvvm_suld_2d_v4i8_trap: - return NVPTXISD::Suld2DV4I8Trap; - case Intrinsic::nvvm_suld_2d_v4i16_trap: - return NVPTXISD::Suld2DV4I16Trap; - case Intrinsic::nvvm_suld_2d_v4i32_trap: - return NVPTXISD::Suld2DV4I32Trap; - case Intrinsic::nvvm_suld_2d_array_i8_trap: - return NVPTXISD::Suld2DArrayI8Trap; - case Intrinsic::nvvm_suld_2d_array_i16_trap: - return NVPTXISD::Suld2DArrayI16Trap; - case Intrinsic::nvvm_suld_2d_array_i32_trap: - return NVPTXISD::Suld2DArrayI32Trap; - case Intrinsic::nvvm_suld_2d_array_i64_trap: - return NVPTXISD::Suld2DArrayI64Trap; - case Intrinsic::nvvm_suld_2d_array_v2i8_trap: - return NVPTXISD::Suld2DArrayV2I8Trap; - case Intrinsic::nvvm_suld_2d_array_v2i16_trap: - return NVPTXISD::Suld2DArrayV2I16Trap; - case Intrinsic::nvvm_suld_2d_array_v2i32_trap: - return NVPTXISD::Suld2DArrayV2I32Trap; - case Intrinsic::nvvm_suld_2d_array_v2i64_trap: - return NVPTXISD::Suld2DArrayV2I64Trap; - case Intrinsic::nvvm_suld_2d_array_v4i8_trap: - return NVPTXISD::Suld2DArrayV4I8Trap; - case Intrinsic::nvvm_suld_2d_array_v4i16_trap: - return NVPTXISD::Suld2DArrayV4I16Trap; - case Intrinsic::nvvm_suld_2d_array_v4i32_trap: - return NVPTXISD::Suld2DArrayV4I32Trap; - case Intrinsic::nvvm_suld_3d_i8_trap: - return NVPTXISD::Suld3DI8Trap; - case Intrinsic::nvvm_suld_3d_i16_trap: - return NVPTXISD::Suld3DI16Trap; - case Intrinsic::nvvm_suld_3d_i32_trap: - return NVPTXISD::Suld3DI32Trap; - case Intrinsic::nvvm_suld_3d_i64_trap: - return NVPTXISD::Suld3DI64Trap; - case Intrinsic::nvvm_suld_3d_v2i8_trap: - return NVPTXISD::Suld3DV2I8Trap; - case Intrinsic::nvvm_suld_3d_v2i16_trap: - return 
NVPTXISD::Suld3DV2I16Trap; - case Intrinsic::nvvm_suld_3d_v2i32_trap: - return NVPTXISD::Suld3DV2I32Trap; - case Intrinsic::nvvm_suld_3d_v2i64_trap: - return NVPTXISD::Suld3DV2I64Trap; - case Intrinsic::nvvm_suld_3d_v4i8_trap: - return NVPTXISD::Suld3DV4I8Trap; - case Intrinsic::nvvm_suld_3d_v4i16_trap: - return NVPTXISD::Suld3DV4I16Trap; - case Intrinsic::nvvm_suld_3d_v4i32_trap: - return NVPTXISD::Suld3DV4I32Trap; - case Intrinsic::nvvm_suld_1d_i8_zero: - return NVPTXISD::Suld1DI8Zero; - case Intrinsic::nvvm_suld_1d_i16_zero: - return NVPTXISD::Suld1DI16Zero; - case Intrinsic::nvvm_suld_1d_i32_zero: - return NVPTXISD::Suld1DI32Zero; - case Intrinsic::nvvm_suld_1d_i64_zero: - return NVPTXISD::Suld1DI64Zero; - case Intrinsic::nvvm_suld_1d_v2i8_zero: - return NVPTXISD::Suld1DV2I8Zero; - case Intrinsic::nvvm_suld_1d_v2i16_zero: - return NVPTXISD::Suld1DV2I16Zero; - case Intrinsic::nvvm_suld_1d_v2i32_zero: - return NVPTXISD::Suld1DV2I32Zero; - case Intrinsic::nvvm_suld_1d_v2i64_zero: - return NVPTXISD::Suld1DV2I64Zero; - case Intrinsic::nvvm_suld_1d_v4i8_zero: - return NVPTXISD::Suld1DV4I8Zero; - case Intrinsic::nvvm_suld_1d_v4i16_zero: - return NVPTXISD::Suld1DV4I16Zero; - case Intrinsic::nvvm_suld_1d_v4i32_zero: - return NVPTXISD::Suld1DV4I32Zero; - case Intrinsic::nvvm_suld_1d_array_i8_zero: - return NVPTXISD::Suld1DArrayI8Zero; - case Intrinsic::nvvm_suld_1d_array_i16_zero: - return NVPTXISD::Suld1DArrayI16Zero; - case Intrinsic::nvvm_suld_1d_array_i32_zero: - return NVPTXISD::Suld1DArrayI32Zero; - case Intrinsic::nvvm_suld_1d_array_i64_zero: - return NVPTXISD::Suld1DArrayI64Zero; - case Intrinsic::nvvm_suld_1d_array_v2i8_zero: - return NVPTXISD::Suld1DArrayV2I8Zero; - case Intrinsic::nvvm_suld_1d_array_v2i16_zero: - return NVPTXISD::Suld1DArrayV2I16Zero; - case Intrinsic::nvvm_suld_1d_array_v2i32_zero: - return NVPTXISD::Suld1DArrayV2I32Zero; - case Intrinsic::nvvm_suld_1d_array_v2i64_zero: - return NVPTXISD::Suld1DArrayV2I64Zero; - case Intrinsic::nvvm_suld_1d_array_v4i8_zero: - return NVPTXISD::Suld1DArrayV4I8Zero; - case Intrinsic::nvvm_suld_1d_array_v4i16_zero: - return NVPTXISD::Suld1DArrayV4I16Zero; - case Intrinsic::nvvm_suld_1d_array_v4i32_zero: - return NVPTXISD::Suld1DArrayV4I32Zero; - case Intrinsic::nvvm_suld_2d_i8_zero: - return NVPTXISD::Suld2DI8Zero; - case Intrinsic::nvvm_suld_2d_i16_zero: - return NVPTXISD::Suld2DI16Zero; - case Intrinsic::nvvm_suld_2d_i32_zero: - return NVPTXISD::Suld2DI32Zero; - case Intrinsic::nvvm_suld_2d_i64_zero: - return NVPTXISD::Suld2DI64Zero; - case Intrinsic::nvvm_suld_2d_v2i8_zero: - return NVPTXISD::Suld2DV2I8Zero; - case Intrinsic::nvvm_suld_2d_v2i16_zero: - return NVPTXISD::Suld2DV2I16Zero; - case Intrinsic::nvvm_suld_2d_v2i32_zero: - return NVPTXISD::Suld2DV2I32Zero; - case Intrinsic::nvvm_suld_2d_v2i64_zero: - return NVPTXISD::Suld2DV2I64Zero; - case Intrinsic::nvvm_suld_2d_v4i8_zero: - return NVPTXISD::Suld2DV4I8Zero; - case Intrinsic::nvvm_suld_2d_v4i16_zero: - return NVPTXISD::Suld2DV4I16Zero; - case Intrinsic::nvvm_suld_2d_v4i32_zero: - return NVPTXISD::Suld2DV4I32Zero; - case Intrinsic::nvvm_suld_2d_array_i8_zero: - return NVPTXISD::Suld2DArrayI8Zero; - case Intrinsic::nvvm_suld_2d_array_i16_zero: - return NVPTXISD::Suld2DArrayI16Zero; - case Intrinsic::nvvm_suld_2d_array_i32_zero: - return NVPTXISD::Suld2DArrayI32Zero; - case Intrinsic::nvvm_suld_2d_array_i64_zero: - return NVPTXISD::Suld2DArrayI64Zero; - case Intrinsic::nvvm_suld_2d_array_v2i8_zero: - return NVPTXISD::Suld2DArrayV2I8Zero; - case 
Intrinsic::nvvm_suld_2d_array_v2i16_zero: - return NVPTXISD::Suld2DArrayV2I16Zero; - case Intrinsic::nvvm_suld_2d_array_v2i32_zero: - return NVPTXISD::Suld2DArrayV2I32Zero; - case Intrinsic::nvvm_suld_2d_array_v2i64_zero: - return NVPTXISD::Suld2DArrayV2I64Zero; - case Intrinsic::nvvm_suld_2d_array_v4i8_zero: - return NVPTXISD::Suld2DArrayV4I8Zero; - case Intrinsic::nvvm_suld_2d_array_v4i16_zero: - return NVPTXISD::Suld2DArrayV4I16Zero; - case Intrinsic::nvvm_suld_2d_array_v4i32_zero: - return NVPTXISD::Suld2DArrayV4I32Zero; - case Intrinsic::nvvm_suld_3d_i8_zero: - return NVPTXISD::Suld3DI8Zero; - case Intrinsic::nvvm_suld_3d_i16_zero: - return NVPTXISD::Suld3DI16Zero; - case Intrinsic::nvvm_suld_3d_i32_zero: - return NVPTXISD::Suld3DI32Zero; - case Intrinsic::nvvm_suld_3d_i64_zero: - return NVPTXISD::Suld3DI64Zero; - case Intrinsic::nvvm_suld_3d_v2i8_zero: - return NVPTXISD::Suld3DV2I8Zero; - case Intrinsic::nvvm_suld_3d_v2i16_zero: - return NVPTXISD::Suld3DV2I16Zero; - case Intrinsic::nvvm_suld_3d_v2i32_zero: - return NVPTXISD::Suld3DV2I32Zero; - case Intrinsic::nvvm_suld_3d_v2i64_zero: - return NVPTXISD::Suld3DV2I64Zero; - case Intrinsic::nvvm_suld_3d_v4i8_zero: - return NVPTXISD::Suld3DV4I8Zero; - case Intrinsic::nvvm_suld_3d_v4i16_zero: - return NVPTXISD::Suld3DV4I16Zero; - case Intrinsic::nvvm_suld_3d_v4i32_zero: - return NVPTXISD::Suld3DV4I32Zero; - } -} - -// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as -// TgtMemIntrinsic -// because we need the information that is only available in the "Value" type -// of destination -// pointer. In particular, the address space information. -bool NVPTXTargetLowering::getTgtMemIntrinsic( - IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { - switch (Intrinsic) { - default: - return false; - - case Intrinsic::nvvm_atomic_load_add_f32: - case Intrinsic::nvvm_atomic_load_inc_32: - case Intrinsic::nvvm_atomic_load_dec_32: - - case Intrinsic::nvvm_atomic_add_gen_f_cta: - case Intrinsic::nvvm_atomic_add_gen_f_sys: - case Intrinsic::nvvm_atomic_add_gen_i_cta: - case Intrinsic::nvvm_atomic_add_gen_i_sys: - case Intrinsic::nvvm_atomic_and_gen_i_cta: - case Intrinsic::nvvm_atomic_and_gen_i_sys: - case Intrinsic::nvvm_atomic_cas_gen_i_cta: - case Intrinsic::nvvm_atomic_cas_gen_i_sys: - case Intrinsic::nvvm_atomic_dec_gen_i_cta: - case Intrinsic::nvvm_atomic_dec_gen_i_sys: - case Intrinsic::nvvm_atomic_inc_gen_i_cta: - case Intrinsic::nvvm_atomic_inc_gen_i_sys: - case Intrinsic::nvvm_atomic_max_gen_i_cta: - case Intrinsic::nvvm_atomic_max_gen_i_sys: - case Intrinsic::nvvm_atomic_min_gen_i_cta: - case Intrinsic::nvvm_atomic_min_gen_i_sys: - case Intrinsic::nvvm_atomic_or_gen_i_cta: - case Intrinsic::nvvm_atomic_or_gen_i_sys: - case Intrinsic::nvvm_atomic_exch_gen_i_cta: - case Intrinsic::nvvm_atomic_exch_gen_i_sys: - case Intrinsic::nvvm_atomic_xor_gen_i_cta: - case Intrinsic::nvvm_atomic_xor_gen_i_sys: { - auto &DL = I.getModule()->getDataLayout(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = getValueType(DL, I.getType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = true; - Info.align = 0; - return true; - } - - case Intrinsic::nvvm_ldu_global_i: - case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: { - auto &DL = I.getModule()->getDataLayout(); - Info.opc = ISD::INTRINSIC_W_CHAIN; - if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = getValueType(DL, I.getType()); - else if(Intrinsic == 
Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(DL); - else - Info.memVT = getValueType(DL, I.getType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = cast(I.getArgOperand(1))->getZExtValue(); - - return true; - } - case Intrinsic::nvvm_ldg_global_i: - case Intrinsic::nvvm_ldg_global_f: - case Intrinsic::nvvm_ldg_global_p: { - auto &DL = I.getModule()->getDataLayout(); - - Info.opc = ISD::INTRINSIC_W_CHAIN; - if (Intrinsic == Intrinsic::nvvm_ldg_global_i) - Info.memVT = getValueType(DL, I.getType()); - else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) - Info.memVT = getPointerTy(DL); - else - Info.memVT = getValueType(DL, I.getType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = cast(I.getArgOperand(1))->getZExtValue(); - - return true; - } - - case Intrinsic::nvvm_tex_1d_v4f32_s32: - case Intrinsic::nvvm_tex_1d_v4f32_f32: - case Intrinsic::nvvm_tex_1d_level_v4f32_f32: - case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_1d_array_v4f32_s32: - case Intrinsic::nvvm_tex_1d_array_v4f32_f32: - case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: - case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_2d_v4f32_s32: - case Intrinsic::nvvm_tex_2d_v4f32_f32: - case Intrinsic::nvvm_tex_2d_level_v4f32_f32: - case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_2d_array_v4f32_s32: - case Intrinsic::nvvm_tex_2d_array_v4f32_f32: - case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: - case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_3d_v4f32_s32: - case Intrinsic::nvvm_tex_3d_v4f32_f32: - case Intrinsic::nvvm_tex_3d_level_v4f32_f32: - case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_cube_v4f32_f32: - case Intrinsic::nvvm_tex_cube_level_v4f32_f32: - case Intrinsic::nvvm_tex_cube_array_v4f32_f32: - case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: - case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: - case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: - case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: - case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: - case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: - case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: - case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: - case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: - case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: - case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: - case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: - case 
Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: - Info.opc = getOpcForTextureInstr(Intrinsic); - Info.memVT = MVT::v4f32; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - - case Intrinsic::nvvm_tex_1d_v4s32_s32: - case Intrinsic::nvvm_tex_1d_v4s32_f32: - case Intrinsic::nvvm_tex_1d_level_v4s32_f32: - case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_1d_array_v4s32_s32: - case Intrinsic::nvvm_tex_1d_array_v4s32_f32: - case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: - case Intrinsic::nvvm_tex_2d_v4s32_s32: - case Intrinsic::nvvm_tex_2d_v4s32_f32: - case Intrinsic::nvvm_tex_2d_level_v4s32_f32: - case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_2d_array_v4s32_s32: - case Intrinsic::nvvm_tex_2d_array_v4s32_f32: - case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: - case Intrinsic::nvvm_tex_3d_v4s32_s32: - case Intrinsic::nvvm_tex_3d_v4s32_f32: - case Intrinsic::nvvm_tex_3d_level_v4s32_f32: - case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_cube_v4s32_f32: - case Intrinsic::nvvm_tex_cube_level_v4s32_f32: - case Intrinsic::nvvm_tex_cube_array_v4s32_f32: - case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_cube_v4u32_f32: - case Intrinsic::nvvm_tex_cube_level_v4u32_f32: - case Intrinsic::nvvm_tex_cube_array_v4u32_f32: - case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: - case Intrinsic::nvvm_tex_1d_v4u32_s32: - case Intrinsic::nvvm_tex_1d_v4u32_f32: - case Intrinsic::nvvm_tex_1d_level_v4u32_f32: - case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: - case Intrinsic::nvvm_tex_1d_array_v4u32_s32: - case Intrinsic::nvvm_tex_1d_array_v4u32_f32: - case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: - case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: - case Intrinsic::nvvm_tex_2d_v4u32_s32: - case Intrinsic::nvvm_tex_2d_v4u32_f32: - case Intrinsic::nvvm_tex_2d_level_v4u32_f32: - case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: - case Intrinsic::nvvm_tex_2d_array_v4u32_s32: - case Intrinsic::nvvm_tex_2d_array_v4u32_f32: - case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: - case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: - case Intrinsic::nvvm_tex_3d_v4u32_s32: - case Intrinsic::nvvm_tex_3d_v4u32_f32: - case Intrinsic::nvvm_tex_3d_level_v4u32_f32: - case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: - case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: - case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: - case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: - case 
Intrinsic::nvvm_tex_unified_2d_v4s32_s32: - case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: - case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: - case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: - case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: - case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: - case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: - case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: - case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: - case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: - case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: - case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: - case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: - case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: - case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: - case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: - case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: - case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: - case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: - case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: - case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: - case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: - case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: - Info.opc = getOpcForTextureInstr(Intrinsic); - Info.memVT = MVT::v4i32; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - - case Intrinsic::nvvm_suld_1d_i8_clamp: - case Intrinsic::nvvm_suld_1d_v2i8_clamp: - case Intrinsic::nvvm_suld_1d_v4i8_clamp: - case Intrinsic::nvvm_suld_1d_array_i8_clamp: - case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: - case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: - case Intrinsic::nvvm_suld_2d_i8_clamp: - case Intrinsic::nvvm_suld_2d_v2i8_clamp: - case Intrinsic::nvvm_suld_2d_v4i8_clamp: - case Intrinsic::nvvm_suld_2d_array_i8_clamp: - case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: - case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: - case Intrinsic::nvvm_suld_3d_i8_clamp: - case Intrinsic::nvvm_suld_3d_v2i8_clamp: - case Intrinsic::nvvm_suld_3d_v4i8_clamp: - case Intrinsic::nvvm_suld_1d_i8_trap: - case 
Intrinsic::nvvm_suld_1d_v2i8_trap: - case Intrinsic::nvvm_suld_1d_v4i8_trap: - case Intrinsic::nvvm_suld_1d_array_i8_trap: - case Intrinsic::nvvm_suld_1d_array_v2i8_trap: - case Intrinsic::nvvm_suld_1d_array_v4i8_trap: - case Intrinsic::nvvm_suld_2d_i8_trap: - case Intrinsic::nvvm_suld_2d_v2i8_trap: - case Intrinsic::nvvm_suld_2d_v4i8_trap: - case Intrinsic::nvvm_suld_2d_array_i8_trap: - case Intrinsic::nvvm_suld_2d_array_v2i8_trap: - case Intrinsic::nvvm_suld_2d_array_v4i8_trap: - case Intrinsic::nvvm_suld_3d_i8_trap: - case Intrinsic::nvvm_suld_3d_v2i8_trap: - case Intrinsic::nvvm_suld_3d_v4i8_trap: - case Intrinsic::nvvm_suld_1d_i8_zero: - case Intrinsic::nvvm_suld_1d_v2i8_zero: - case Intrinsic::nvvm_suld_1d_v4i8_zero: - case Intrinsic::nvvm_suld_1d_array_i8_zero: - case Intrinsic::nvvm_suld_1d_array_v2i8_zero: - case Intrinsic::nvvm_suld_1d_array_v4i8_zero: - case Intrinsic::nvvm_suld_2d_i8_zero: - case Intrinsic::nvvm_suld_2d_v2i8_zero: - case Intrinsic::nvvm_suld_2d_v4i8_zero: - case Intrinsic::nvvm_suld_2d_array_i8_zero: - case Intrinsic::nvvm_suld_2d_array_v2i8_zero: - case Intrinsic::nvvm_suld_2d_array_v4i8_zero: - case Intrinsic::nvvm_suld_3d_i8_zero: - case Intrinsic::nvvm_suld_3d_v2i8_zero: - case Intrinsic::nvvm_suld_3d_v4i8_zero: - Info.opc = getOpcForSurfaceInstr(Intrinsic); - Info.memVT = MVT::i8; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - - case Intrinsic::nvvm_suld_1d_i16_clamp: - case Intrinsic::nvvm_suld_1d_v2i16_clamp: - case Intrinsic::nvvm_suld_1d_v4i16_clamp: - case Intrinsic::nvvm_suld_1d_array_i16_clamp: - case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: - case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: - case Intrinsic::nvvm_suld_2d_i16_clamp: - case Intrinsic::nvvm_suld_2d_v2i16_clamp: - case Intrinsic::nvvm_suld_2d_v4i16_clamp: - case Intrinsic::nvvm_suld_2d_array_i16_clamp: - case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: - case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: - case Intrinsic::nvvm_suld_3d_i16_clamp: - case Intrinsic::nvvm_suld_3d_v2i16_clamp: - case Intrinsic::nvvm_suld_3d_v4i16_clamp: - case Intrinsic::nvvm_suld_1d_i16_trap: - case Intrinsic::nvvm_suld_1d_v2i16_trap: - case Intrinsic::nvvm_suld_1d_v4i16_trap: - case Intrinsic::nvvm_suld_1d_array_i16_trap: - case Intrinsic::nvvm_suld_1d_array_v2i16_trap: - case Intrinsic::nvvm_suld_1d_array_v4i16_trap: - case Intrinsic::nvvm_suld_2d_i16_trap: - case Intrinsic::nvvm_suld_2d_v2i16_trap: - case Intrinsic::nvvm_suld_2d_v4i16_trap: - case Intrinsic::nvvm_suld_2d_array_i16_trap: - case Intrinsic::nvvm_suld_2d_array_v2i16_trap: - case Intrinsic::nvvm_suld_2d_array_v4i16_trap: - case Intrinsic::nvvm_suld_3d_i16_trap: - case Intrinsic::nvvm_suld_3d_v2i16_trap: - case Intrinsic::nvvm_suld_3d_v4i16_trap: - case Intrinsic::nvvm_suld_1d_i16_zero: - case Intrinsic::nvvm_suld_1d_v2i16_zero: - case Intrinsic::nvvm_suld_1d_v4i16_zero: - case Intrinsic::nvvm_suld_1d_array_i16_zero: - case Intrinsic::nvvm_suld_1d_array_v2i16_zero: - case Intrinsic::nvvm_suld_1d_array_v4i16_zero: - case Intrinsic::nvvm_suld_2d_i16_zero: - case Intrinsic::nvvm_suld_2d_v2i16_zero: - case Intrinsic::nvvm_suld_2d_v4i16_zero: - case Intrinsic::nvvm_suld_2d_array_i16_zero: - case Intrinsic::nvvm_suld_2d_array_v2i16_zero: - case Intrinsic::nvvm_suld_2d_array_v4i16_zero: - case Intrinsic::nvvm_suld_3d_i16_zero: - case Intrinsic::nvvm_suld_3d_v2i16_zero: - case Intrinsic::nvvm_suld_3d_v4i16_zero: - Info.opc = 
getOpcForSurfaceInstr(Intrinsic); - Info.memVT = MVT::i16; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - - case Intrinsic::nvvm_suld_1d_i32_clamp: - case Intrinsic::nvvm_suld_1d_v2i32_clamp: - case Intrinsic::nvvm_suld_1d_v4i32_clamp: - case Intrinsic::nvvm_suld_1d_array_i32_clamp: - case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: - case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: - case Intrinsic::nvvm_suld_2d_i32_clamp: - case Intrinsic::nvvm_suld_2d_v2i32_clamp: - case Intrinsic::nvvm_suld_2d_v4i32_clamp: - case Intrinsic::nvvm_suld_2d_array_i32_clamp: - case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: - case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: - case Intrinsic::nvvm_suld_3d_i32_clamp: - case Intrinsic::nvvm_suld_3d_v2i32_clamp: - case Intrinsic::nvvm_suld_3d_v4i32_clamp: - case Intrinsic::nvvm_suld_1d_i32_trap: - case Intrinsic::nvvm_suld_1d_v2i32_trap: - case Intrinsic::nvvm_suld_1d_v4i32_trap: - case Intrinsic::nvvm_suld_1d_array_i32_trap: - case Intrinsic::nvvm_suld_1d_array_v2i32_trap: - case Intrinsic::nvvm_suld_1d_array_v4i32_trap: - case Intrinsic::nvvm_suld_2d_i32_trap: - case Intrinsic::nvvm_suld_2d_v2i32_trap: - case Intrinsic::nvvm_suld_2d_v4i32_trap: - case Intrinsic::nvvm_suld_2d_array_i32_trap: - case Intrinsic::nvvm_suld_2d_array_v2i32_trap: - case Intrinsic::nvvm_suld_2d_array_v4i32_trap: - case Intrinsic::nvvm_suld_3d_i32_trap: - case Intrinsic::nvvm_suld_3d_v2i32_trap: - case Intrinsic::nvvm_suld_3d_v4i32_trap: - case Intrinsic::nvvm_suld_1d_i32_zero: - case Intrinsic::nvvm_suld_1d_v2i32_zero: - case Intrinsic::nvvm_suld_1d_v4i32_zero: - case Intrinsic::nvvm_suld_1d_array_i32_zero: - case Intrinsic::nvvm_suld_1d_array_v2i32_zero: - case Intrinsic::nvvm_suld_1d_array_v4i32_zero: - case Intrinsic::nvvm_suld_2d_i32_zero: - case Intrinsic::nvvm_suld_2d_v2i32_zero: - case Intrinsic::nvvm_suld_2d_v4i32_zero: - case Intrinsic::nvvm_suld_2d_array_i32_zero: - case Intrinsic::nvvm_suld_2d_array_v2i32_zero: - case Intrinsic::nvvm_suld_2d_array_v4i32_zero: - case Intrinsic::nvvm_suld_3d_i32_zero: - case Intrinsic::nvvm_suld_3d_v2i32_zero: - case Intrinsic::nvvm_suld_3d_v4i32_zero: - Info.opc = getOpcForSurfaceInstr(Intrinsic); - Info.memVT = MVT::i32; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - - case Intrinsic::nvvm_suld_1d_i64_clamp: - case Intrinsic::nvvm_suld_1d_v2i64_clamp: - case Intrinsic::nvvm_suld_1d_array_i64_clamp: - case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: - case Intrinsic::nvvm_suld_2d_i64_clamp: - case Intrinsic::nvvm_suld_2d_v2i64_clamp: - case Intrinsic::nvvm_suld_2d_array_i64_clamp: - case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: - case Intrinsic::nvvm_suld_3d_i64_clamp: - case Intrinsic::nvvm_suld_3d_v2i64_clamp: - case Intrinsic::nvvm_suld_1d_i64_trap: - case Intrinsic::nvvm_suld_1d_v2i64_trap: - case Intrinsic::nvvm_suld_1d_array_i64_trap: - case Intrinsic::nvvm_suld_1d_array_v2i64_trap: - case Intrinsic::nvvm_suld_2d_i64_trap: - case Intrinsic::nvvm_suld_2d_v2i64_trap: - case Intrinsic::nvvm_suld_2d_array_i64_trap: - case Intrinsic::nvvm_suld_2d_array_v2i64_trap: - case Intrinsic::nvvm_suld_3d_i64_trap: - case Intrinsic::nvvm_suld_3d_v2i64_trap: - case Intrinsic::nvvm_suld_1d_i64_zero: - case Intrinsic::nvvm_suld_1d_v2i64_zero: - case Intrinsic::nvvm_suld_1d_array_i64_zero: - case Intrinsic::nvvm_suld_1d_array_v2i64_zero: - case 
Intrinsic::nvvm_suld_2d_i64_zero: - case Intrinsic::nvvm_suld_2d_v2i64_zero: - case Intrinsic::nvvm_suld_2d_array_i64_zero: - case Intrinsic::nvvm_suld_2d_array_v2i64_zero: - case Intrinsic::nvvm_suld_3d_i64_zero: - case Intrinsic::nvvm_suld_3d_v2i64_zero: - Info.opc = getOpcForSurfaceInstr(Intrinsic); - Info.memVT = MVT::i64; - Info.ptrVal = nullptr; - Info.offset = 0; - Info.vol = false; - Info.readMem = true; - Info.writeMem = false; - Info.align = 16; - return true; - } - return false; -} - -/// isLegalAddressingMode - Return true if the addressing mode represented -/// by AM is legal for this target, for a load/store of the specified type. -/// Used to guide target specific optimizations, like loop strength reduction -/// (LoopStrengthReduce.cpp) and memory optimization for address mode -/// (CodeGenPrepare.cpp) -bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, - const AddrMode &AM, Type *Ty, - unsigned AS) const { - // AddrMode - This represents an addressing mode of: - // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg - // - // The legal address modes are - // - [avar] - // - [areg] - // - [areg+immoff] - // - [immAddr] - - if (AM.BaseGV) { - return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; - } - - switch (AM.Scale) { - case 0: // "r", "r+i" or "i" is allowed - break; - case 1: - if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. - return false; - // Otherwise we have r+i. - break; - default: - // No scale > 1 is allowed - return false; - } - return true; -} - -//===----------------------------------------------------------------------===// -// NVPTX Inline Assembly Support -//===----------------------------------------------------------------------===// - -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. -NVPTXTargetLowering::ConstraintType -NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: - break; - case 'b': - case 'r': - case 'h': - case 'c': - case 'l': - case 'f': - case 'd': - case '0': - case 'N': - return C_RegisterClass; - } - } - return TargetLowering::getConstraintType(Constraint); -} - -std::pair -NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - StringRef Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'b': - return std::make_pair(0U, &NVPTX::Int1RegsRegClass); - case 'c': - return std::make_pair(0U, &NVPTX::Int16RegsRegClass); - case 'h': - return std::make_pair(0U, &NVPTX::Int16RegsRegClass); - case 'r': - return std::make_pair(0U, &NVPTX::Int32RegsRegClass); - case 'l': - case 'N': - return std::make_pair(0U, &NVPTX::Int64RegsRegClass); - case 'f': - return std::make_pair(0U, &NVPTX::Float32RegsRegClass); - case 'd': - return std::make_pair(0U, &NVPTX::Float64RegsRegClass); - } - } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); -} - -//===----------------------------------------------------------------------===// -// NVPTX DAG Combining -//===----------------------------------------------------------------------===// - -bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, - CodeGenOpt::Level OptLevel) const { - // Always honor command-line argument - if (FMAContractLevelOpt.getNumOccurrences() > 0) - return FMAContractLevelOpt > 0; - - // Do not contract if we're not optimizing the code. 
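// Standalone sketch (not part of this patch) of the precedence order used in
// allowFMA when deciding whether FMA contraction is allowed: an explicit
// -nvptx-fma-level flag always wins, -O0 disables contraction, then the
// fp-contract / unsafe-fp-math options enable it. The type and member names
// below are hypothetical stand-ins for the real LLVM option types.
#include <optional>

namespace fma_policy_sketch {
enum class FPFusion { Fast, Standard, Strict };

struct Options {
  std::optional<int> FMALevelFlag; // set only if -nvptx-fma-level was given
  unsigned OptLevel;               // 0 == -O0
  FPFusion Fusion;                 // AllowFPOpFusion-style setting
  bool UnsafeFPMath;
};

// Mirrors the ladder: explicit flag > -O0 > fp-contract=fast > unsafe math.
inline bool allowFMASketch(const Options &O) {
  if (O.FMALevelFlag)             // command line always wins
    return *O.FMALevelFlag > 0;
  if (O.OptLevel == 0)            // never contract at -O0
    return false;
  if (O.Fusion == FPFusion::Fast) // fusion explicitly allowed
    return true;
  return O.UnsafeFPMath;          // finally, fall back to unsafe-fp-math
}
} // namespace fma_policy_sketch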
- if (OptLevel == 0) - return false; - - // Honor TargetOptions flags that explicitly say fusion is okay. - if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) - return true; - - return allowUnsafeFPMath(MF); -} - -bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { - // Honor TargetOptions flags that explicitly say unsafe math is okay. - if (MF.getTarget().Options.UnsafeFPMath) - return true; - - // Allow unsafe math if unsafe-fp-math attribute explicitly says so. - const Function *F = MF.getFunction(); - if (F->hasFnAttribute("unsafe-fp-math")) { - Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - StringRef Val = Attr.getValueAsString(); - if (Val == "true") - return true; - } - - return false; -} - -/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with -/// operands N0 and N1. This is a helper for PerformADDCombine that is -/// called with the default operands, and if that fails, with commuted -/// operands. -static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, - TargetLowering::DAGCombinerInfo &DCI, - const NVPTXSubtarget &Subtarget, - CodeGenOpt::Level OptLevel) { - SelectionDAG &DAG = DCI.DAG; - // Skip non-integer, non-scalar case - EVT VT=N0.getValueType(); - if (VT.isVector()) - return SDValue(); - - // fold (add (mul a, b), c) -> (mad a, b, c) - // - if (N0.getOpcode() == ISD::MUL) { - assert (VT.isInteger()); - // For integer: - // Since integer multiply-add costs the same as integer multiply - // but is more costly than integer add, do the fusion only when - // the mul is only used in the add. - if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || - !N0.getNode()->hasOneUse()) - return SDValue(); - - // Do the folding - return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), N1); - } - else if (N0.getOpcode() == ISD::FMUL) { - if (VT == MVT::f32 || VT == MVT::f64) { - const auto *TLI = static_cast( - &DAG.getTargetLoweringInfo()); - if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) - return SDValue(); - - // For floating point: - // Do the fusion only when the mul has less than 5 uses and all - // are add. - // The heuristic is that if a use is not an add, then that use - // cannot be fused into fma, therefore mul is still needed anyway. - // If there are more than 4 uses, even if they are all add, fusing - // them will increase register pressue. - // - int numUses = 0; - int nonAddCount = 0; - for (SDNode::use_iterator UI = N0.getNode()->use_begin(), - UE = N0.getNode()->use_end(); - UI != UE; ++UI) { - numUses++; - SDNode *User = *UI; - if (User->getOpcode() != ISD::FADD) - ++nonAddCount; - } - if (numUses >= 5) - return SDValue(); - if (nonAddCount) { - int orderNo = N->getIROrder(); - int orderNo2 = N0.getNode()->getIROrder(); - // simple heuristics here for considering potential register - // pressure, the logics here is that the differnce are used - // to measure the distance between def and use, the longer distance - // more likely cause register pressure. - if (orderNo - orderNo2 < 500) - return SDValue(); - - // Now, check if at least one of the FMUL's operands is live beyond the node N, - // which guarantees that the FMA will not increase register pressure at node N. 
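// Standalone sketch (not part of this patch) of the FMUL-fusion heuristic
// described in the comments above. UseInfo is a hypothetical summary of one
// use of the FMUL node; the thresholds (fewer than 5 uses, IR-order distance
// of at least 500) mirror the constants used in the code above.
#include <vector>

namespace fma_heuristic_sketch {
struct UseInfo {
  bool IsFAdd; // is this use an FADD (and therefore itself fusable)?
};

inline bool shouldFuseSketch(int AddIROrder, int MulIROrder,
                             const std::vector<UseInfo> &MulUses,
                             bool OperandLiveBeyondAdd) {
  if (MulUses.size() >= 5)        // too many uses: fusing raises pressure
    return false;
  bool AllAdds = true;
  for (const UseInfo &U : MulUses)
    AllAdds = AllAdds && U.IsFAdd;
  if (AllAdds)                    // every use can become an FMA
    return true;
  // Some use still needs the FMUL. Only fuse when the def-use distance is
  // large enough...
  if (AddIROrder - MulIROrder < 500)
    return false;
  // ...and at least one FMUL operand is live beyond the FADD anyway, so the
  // FMA does not extend any live range at that point.
  return OperandLiveBeyondAdd;
}
} // namespace fma_heuristic_sketch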
- bool opIsLive = false; - const SDNode *left = N0.getOperand(0).getNode(); - const SDNode *right = N0.getOperand(1).getNode(); - - if (isa(left) || isa(right)) - opIsLive = true; - - if (!opIsLive) - for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - int orderNo3 = User->getIROrder(); - if (orderNo3 > orderNo) { - opIsLive = true; - break; - } - } - - if (!opIsLive) - for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - int orderNo3 = User->getIROrder(); - if (orderNo3 > orderNo) { - opIsLive = true; - break; - } - } - - if (!opIsLive) - return SDValue(); - } - - return DAG.getNode(ISD::FMA, SDLoc(N), VT, - N0.getOperand(0), N0.getOperand(1), N1); - } - } - - return SDValue(); -} - -/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. -/// -static SDValue PerformADDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const NVPTXSubtarget &Subtarget, - CodeGenOpt::Level OptLevel) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // First try with the default operand order. - if (SDValue Result = - PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) - return Result; - - // If that didn't work, try again with the operands commuted. - return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); -} - -static SDValue PerformANDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // The type legalizer turns a vector load of i8 values into a zextload to i16 - // registers, optionally ANY_EXTENDs it (if target type is integer), - // and ANDs off the high 8 bits. Since we turn this load into a - // target-specific DAG node, the DAG combiner fails to eliminate these AND - // nodes. Do that here. - SDValue Val = N->getOperand(0); - SDValue Mask = N->getOperand(1); - - if (isa(Val)) { - std::swap(Val, Mask); - } - - SDValue AExt; - // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and - if (Val.getOpcode() == ISD::ANY_EXTEND) { - AExt = Val; - Val = Val->getOperand(0); - } - - if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { - Val = Val->getOperand(0); - } - - if (Val->getOpcode() == NVPTXISD::LoadV2 || - Val->getOpcode() == NVPTXISD::LoadV4) { - ConstantSDNode *MaskCnst = dyn_cast(Mask); - if (!MaskCnst) { - // Not an AND with a constant - return SDValue(); - } - - uint64_t MaskVal = MaskCnst->getZExtValue(); - if (MaskVal != 0xff) { - // Not an AND that chops off top 8 bits - return SDValue(); - } - - MemSDNode *Mem = dyn_cast(Val); - if (!Mem) { - // Not a MemSDNode?!? - return SDValue(); - } - - EVT MemVT = Mem->getMemoryVT(); - if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { - // We only handle the i8 case - return SDValue(); - } - - unsigned ExtType = - cast(Val->getOperand(Val->getNumOperands()-1))-> - getZExtValue(); - if (ExtType == ISD::SEXTLOAD) { - // If for some reason the load is a sextload, the and is needed to zero - // out the high 8 bits - return SDValue(); - } - - bool AddTo = false; - if (AExt.getNode() != nullptr) { - // Re-insert the ext as a zext. - Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), - AExt.getValueType(), Val); - AddTo = true; - } - - // If we get here, the AND is unnecessary. 
Just replace it with the load - DCI.CombineTo(N, Val, AddTo); - } - - return SDValue(); -} - -static SDValue PerformREMCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - CodeGenOpt::Level OptLevel) { - assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); - - // Don't do anything at less than -O2. - if (OptLevel < CodeGenOpt::Default) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - bool IsSigned = N->getOpcode() == ISD::SREM; - unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; - - const SDValue &Num = N->getOperand(0); - const SDValue &Den = N->getOperand(1); - - for (const SDNode *U : Num->uses()) { - if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && - U->getOperand(1) == Den) { - // Num % Den -> Num - (Num / Den) * Den - return DAG.getNode(ISD::SUB, DL, VT, Num, - DAG.getNode(ISD::MUL, DL, VT, - DAG.getNode(DivOpc, DL, VT, Num, Den), - Den)); - } - } - return SDValue(); -} - -enum OperandSignedness { - Signed = 0, - Unsigned, - Unknown -}; - -/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand -/// that can be demoted to \p OptSize bits without loss of information. The -/// signedness of the operand, if determinable, is placed in \p S. -static bool IsMulWideOperandDemotable(SDValue Op, - unsigned OptSize, - OperandSignedness &S) { - S = Unknown; - - if (Op.getOpcode() == ISD::SIGN_EXTEND || - Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { - EVT OrigVT = Op.getOperand(0).getValueType(); - if (OrigVT.getSizeInBits() <= OptSize) { - S = Signed; - return true; - } - } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { - EVT OrigVT = Op.getOperand(0).getValueType(); - if (OrigVT.getSizeInBits() <= OptSize) { - S = Unsigned; - return true; - } - } - - return false; -} - -/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can -/// be demoted to \p OptSize bits without loss of information. If the operands -/// contain a constant, it should appear as the RHS operand. The signedness of -/// the operands is placed in \p IsSigned. -static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, - unsigned OptSize, - bool &IsSigned) { - OperandSignedness LHSSign; - - // The LHS operand must be a demotable op - if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) - return false; - - // We should have been able to determine the signedness from the LHS - if (LHSSign == Unknown) - return false; - - IsSigned = (LHSSign == Signed); - - // The RHS can be a demotable op or a constant - if (ConstantSDNode *CI = dyn_cast(RHS)) { - const APInt &Val = CI->getAPIntValue(); - if (LHSSign == Unsigned) { - return Val.isIntN(OptSize); - } else { - return Val.isSignedIntN(OptSize); - } - } else { - OperandSignedness RHSSign; - if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) - return false; - - return LHSSign == RHSSign; - } -} - -/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply -/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform -/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift -/// amount. 
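// Standalone arithmetic illustration (not part of this patch) of the rewrite
// described above: when both operands of an i32 multiply are extensions of
// 16-bit values, the product can be formed by demoting the operands and using
// a 16x16 -> 32 widening multiply (PTX mul.wide.s16 / mul.wide.u16). An SHL
// by a constant k is handled the same way after rewriting it as a multiply
// by 1 << k.
#include <cassert>
#include <cstdint>

namespace mul_wide_sketch {
inline int32_t mulWideS16(int32_t X, int32_t Y) {
  // Demote the operands back to 16 bits, then widen through the multiply;
  // this is the effect expressed by NVPTXISD::MUL_WIDE_SIGNED.
  int16_t XLo = static_cast<int16_t>(X);
  int16_t YLo = static_cast<int16_t>(Y);
  return static_cast<int32_t>(XLo) * static_cast<int32_t>(YLo);
}

inline void check() {
  // Operands that are sign extensions of 16-bit values...
  int32_t X = static_cast<int16_t>(-1234);
  int32_t Y = static_cast<int16_t>(567);
  // ...give the same product with either the full or the widening multiply.
  assert(mulWideS16(X, Y) == X * Y);
}
} // namespace mul_wide_sketch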
-static SDValue TryMULWIDECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - EVT MulType = N->getValueType(0); - if (MulType != MVT::i32 && MulType != MVT::i64) { - return SDValue(); - } - - SDLoc DL(N); - unsigned OptSize = MulType.getSizeInBits() >> 1; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // Canonicalize the multiply so the constant (if any) is on the right - if (N->getOpcode() == ISD::MUL) { - if (isa(LHS)) { - std::swap(LHS, RHS); - } - } - - // If we have a SHL, determine the actual multiply amount - if (N->getOpcode() == ISD::SHL) { - ConstantSDNode *ShlRHS = dyn_cast(RHS); - if (!ShlRHS) { - return SDValue(); - } - - APInt ShiftAmt = ShlRHS->getAPIntValue(); - unsigned BitWidth = MulType.getSizeInBits(); - if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { - APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; - RHS = DCI.DAG.getConstant(MulVal, DL, MulType); - } else { - return SDValue(); - } - } - - bool Signed; - // Verify that our operands are demotable - if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { - return SDValue(); - } - - EVT DemotedVT; - if (MulType == MVT::i32) { - DemotedVT = MVT::i16; - } else { - DemotedVT = MVT::i32; - } - - // Truncate the operands to the correct size. Note that these are just for - // type consistency and will (likely) be eliminated in later phases. - SDValue TruncLHS = - DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); - SDValue TruncRHS = - DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); - - unsigned Opc; - if (Signed) { - Opc = NVPTXISD::MUL_WIDE_SIGNED; - } else { - Opc = NVPTXISD::MUL_WIDE_UNSIGNED; - } - - return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); -} - -/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. -static SDValue PerformMULCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - CodeGenOpt::Level OptLevel) { - if (OptLevel > 0) { - // Try mul.wide combining at OptLevel > 0 - if (SDValue Ret = TryMULWIDECombine(N, DCI)) - return Ret; - } - - return SDValue(); -} - -/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. -static SDValue PerformSHLCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - CodeGenOpt::Level OptLevel) { - if (OptLevel > 0) { - // Try mul.wide combining at OptLevel > 0 - if (SDValue Ret = TryMULWIDECombine(N, DCI)) - return Ret; - } - - return SDValue(); -} - -static SDValue PerformSETCCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - EVT CCType = N->getValueType(0); - SDValue A = N->getOperand(0); - SDValue B = N->getOperand(1); - - if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) - return SDValue(); - - SDLoc DL(N); - // setp.f16x2 returns two scalar predicates, which we need to - // convert back to v2i1. The returned result will be scalarized by - // the legalizer, but the comparison will remain a single vector - // instruction. 
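// Standalone sketch (not part of this patch) of the idea described above: a
// single two-lane compare produces two scalar predicates, which are then
// reassembled into a two-element boolean result. Plain floats stand in for
// the two f16 lanes of a v2f16 value here.
#include <array>
#include <utility>

namespace setcc_f16x2_sketch {
// One vector compare yields one predicate per lane (as setp.f16x2 does).
inline std::pair<bool, bool> cmpLTLanes(std::array<float, 2> A,
                                        std::array<float, 2> B) {
  return {A[0] < B[0], A[1] < B[1]};
}

// The two scalar predicates are packed back into a v2i1-style result.
inline std::array<bool, 2> setccV2(std::array<float, 2> A,
                                   std::array<float, 2> B) {
  auto [P0, P1] = cmpLTLanes(A, B);
  return {P0, P1};
}
} // namespace setcc_f16x2_sketch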
- SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, - DCI.DAG.getVTList(MVT::i1, MVT::i1), - {A, B, N->getOperand(2)}); - return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), - CCNode.getValue(1)); -} - -SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); - switch (N->getOpcode()) { - default: break; - case ISD::ADD: - case ISD::FADD: - return PerformADDCombine(N, DCI, STI, OptLevel); - case ISD::MUL: - return PerformMULCombine(N, DCI, OptLevel); - case ISD::SHL: - return PerformSHLCombine(N, DCI, OptLevel); - case ISD::AND: - return PerformANDCombine(N, DCI); - case ISD::UREM: - case ISD::SREM: - return PerformREMCombine(N, DCI, OptLevel); - case ISD::SETCC: - return PerformSETCCCombine(N, DCI); - } - return SDValue(); -} - -/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. -static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl &Results) { - EVT ResVT = N->getValueType(0); - SDLoc DL(N); - - assert(ResVT.isVector() && "Vector load must have vector type"); - - // We only handle "native" vector sizes for now, e.g. <4 x double> is not - // legal. We can (and should) split that into 2 loads of <2 x double> here - // but I'm leaving that as a TODO for now. - assert(ResVT.isSimple() && "Can only handle simple types"); - switch (ResVT.getSimpleVT().SimpleTy) { - default: - return; - case MVT::v2i8: - case MVT::v2i16: - case MVT::v2i32: - case MVT::v2i64: - case MVT::v2f16: - case MVT::v2f32: - case MVT::v2f64: - case MVT::v4i8: - case MVT::v4i16: - case MVT::v4i32: - case MVT::v4f16: - case MVT::v4f32: - case MVT::v8f16: // <4 x f16x2> - // This is a "native" vector type - break; - } - - LoadSDNode *LD = cast(N); - - unsigned Align = LD->getAlignment(); - auto &TD = DAG.getDataLayout(); - unsigned PrefAlign = - TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); - if (Align < PrefAlign) { - // This load is not sufficiently aligned, so bail out and let this vector - // load be scalarized. Note that we may still be able to emit smaller - // vector loads. For example, if we are loading a <4 x float> with an - // alignment of 8, this check will fail but the legalizer will try again - // with 2 x <2 x float>, which will succeed with an alignment of 8. - return; - } - - EVT EltVT = ResVT.getVectorElementType(); - unsigned NumElts = ResVT.getVectorNumElements(); - - // Since LoadV2 is a target node, we cannot rely on DAG type legalization. - // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // loaded type to i16 and propagate the "real" type as the memory type. - bool NeedTrunc = false; - if (EltVT.getSizeInBits() < 16) { - EltVT = MVT::i16; - NeedTrunc = true; - } - - unsigned Opcode = 0; - SDVTList LdResVTs; - bool LoadF16x2 = false; - - switch (NumElts) { - default: - return; - case 2: - Opcode = NVPTXISD::LoadV2; - LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); - break; - case 4: { - Opcode = NVPTXISD::LoadV4; - EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs); - break; - } - case 8: { - // v8f16 is a special case. PTX doesn't have ld.v8.f16 - // instruction. Instead, we split the vector into v2f16 chunks and - // load them with ld.v4.b32. 
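// Standalone sketch (not part of this patch) of the layout trick described
// above: an 8 x f16 vector is loaded as four 32-bit chunks (one v2f16 per
// chunk, as ld.v4.b32 would produce) and then split back into eight halves.
// uint16_t stands in for a raw f16 bit pattern, and the unpacking assumes the
// first lane occupies the low 16 bits of each chunk (little-endian packing).
#include <array>
#include <cstdint>

namespace v8f16_load_sketch {
inline std::array<uint16_t, 8>
splitChunks(const std::array<uint32_t, 4> &Chunks) {
  std::array<uint16_t, 8> Halves{};
  for (int I = 0; I < 4; ++I) {
    Halves[2 * I + 0] = static_cast<uint16_t>(Chunks[I] & 0xFFFFu); // lane 0
    Halves[2 * I + 1] = static_cast<uint16_t>(Chunks[I] >> 16);     // lane 1
  }
  return Halves;
}
} // namespace v8f16_load_sketch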
- assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); - LoadF16x2 = true; - Opcode = NVPTXISD::LoadV4; - EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, - MVT::Other}; - LdResVTs = DAG.getVTList(ListVTs); - break; - } - } - - // Copy regular operands - SmallVector OtherOps(N->op_begin(), N->op_end()); - - // The select routine does not have access to the LoadSDNode instance, so - // pass along the extension information - OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); - - SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, - LD->getMemoryVT(), - LD->getMemOperand()); - - SmallVector ScalarRes; - if (LoadF16x2) { - // Split v2f16 subvectors back into individual elements. - NumElts /= 2; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue SubVector = NewLD.getValue(i); - SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, - DAG.getIntPtrConstant(0, DL)); - SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, - DAG.getIntPtrConstant(1, DL)); - ScalarRes.push_back(E0); - ScalarRes.push_back(E1); - } - } else { - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Res = NewLD.getValue(i); - if (NeedTrunc) - Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); - ScalarRes.push_back(Res); - } - } - - SDValue LoadChain = NewLD.getValue(NumElts); - - SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); - - Results.push_back(BuildVec); - Results.push_back(LoadChain); -} - -static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl &Results) { - SDValue Chain = N->getOperand(0); - SDValue Intrin = N->getOperand(1); - SDLoc DL(N); - - // Get the intrinsic ID - unsigned IntrinNo = cast(Intrin.getNode())->getZExtValue(); - switch (IntrinNo) { - default: - return; - case Intrinsic::nvvm_ldg_global_i: - case Intrinsic::nvvm_ldg_global_f: - case Intrinsic::nvvm_ldg_global_p: - case Intrinsic::nvvm_ldu_global_i: - case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: { - EVT ResVT = N->getValueType(0); - - if (ResVT.isVector()) { - // Vector LDG/LDU - - unsigned NumElts = ResVT.getVectorNumElements(); - EVT EltVT = ResVT.getVectorElementType(); - - // Since LDU/LDG are target nodes, we cannot rely on DAG type - // legalization. - // Therefore, we must ensure the type is legal. For i1 and i8, we set the - // loaded type to i16 and propagate the "real" type as the memory type. 
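// Standalone sketch (not part of this patch) of the type-legalization
// workaround described above: elements narrower than 16 bits are produced as
// i16 by the target load node and truncated back afterwards. For a
// zero-extended i8 value the round trip is lossless.
#include <cassert>
#include <cstdint>

namespace narrow_load_sketch {
inline uint8_t loadI8ViaI16(uint8_t InMemory) {
  uint16_t Widened = InMemory;          // what the i16-typed load yields
  return static_cast<uint8_t>(Widened); // the explicit TRUNCATE afterwards
}

inline void check() {
  for (int V = 0; V <= 0xFF; ++V)
    assert(loadI8ViaI16(static_cast<uint8_t>(V)) == V);
}
} // namespace narrow_load_sketch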
- bool NeedTrunc = false; - if (EltVT.getSizeInBits() < 16) { - EltVT = MVT::i16; - NeedTrunc = true; - } - - unsigned Opcode = 0; - SDVTList LdResVTs; - - switch (NumElts) { - default: - return; - case 2: - switch (IntrinNo) { - default: - return; - case Intrinsic::nvvm_ldg_global_i: - case Intrinsic::nvvm_ldg_global_f: - case Intrinsic::nvvm_ldg_global_p: - Opcode = NVPTXISD::LDGV2; - break; - case Intrinsic::nvvm_ldu_global_i: - case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: - Opcode = NVPTXISD::LDUV2; - break; - } - LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); - break; - case 4: { - switch (IntrinNo) { - default: - return; - case Intrinsic::nvvm_ldg_global_i: - case Intrinsic::nvvm_ldg_global_f: - case Intrinsic::nvvm_ldg_global_p: - Opcode = NVPTXISD::LDGV4; - break; - case Intrinsic::nvvm_ldu_global_i: - case Intrinsic::nvvm_ldu_global_f: - case Intrinsic::nvvm_ldu_global_p: - Opcode = NVPTXISD::LDUV4; - break; - } - EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs); - break; - } - } - - SmallVector OtherOps; - - // Copy regular operands - - OtherOps.push_back(Chain); // Chain - // Skip operand 1 (intrinsic ID) - // Others - OtherOps.append(N->op_begin() + 2, N->op_end()); - - MemIntrinsicSDNode *MemSD = cast(N); - - SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, - MemSD->getMemoryVT(), - MemSD->getMemOperand()); - - SmallVector ScalarRes; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Res = NewLD.getValue(i); - if (NeedTrunc) - Res = - DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); - ScalarRes.push_back(Res); - } - - SDValue LoadChain = NewLD.getValue(NumElts); - - SDValue BuildVec = - DAG.getBuildVector(ResVT, DL, ScalarRes); - - Results.push_back(BuildVec); - Results.push_back(LoadChain); - } else { - // i8 LDG/LDU - assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && - "Custom handling of non-i8 ldu/ldg?"); - - // Just copy all operands as-is - SmallVector Ops(N->op_begin(), N->op_end()); - - // Force output to i16 - SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); - - MemIntrinsicSDNode *MemSD = cast(N); - - // We make sure the memory type is i8, which will be used during isel - // to select the proper instruction. - SDValue NewLD = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, - MVT::i8, MemSD->getMemOperand()); - - Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, - NewLD.getValue(0))); - Results.push_back(NewLD.getValue(1)); - } - } - } -} - -void NVPTXTargetLowering::ReplaceNodeResults( - SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - report_fatal_error("Unhandled custom legalization"); - case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results); - return; - case ISD::INTRINSIC_W_CHAIN: - ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); - return; - } -} - -// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file. 
-void NVPTXSection::anchor() {} - -NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { - delete static_cast(TextSection); - delete static_cast(DataSection); - delete static_cast(BSSSection); - delete static_cast(ReadOnlySection); - - delete static_cast(StaticCtorSection); - delete static_cast(StaticDtorSection); - delete static_cast(LSDASection); - delete static_cast(EHFrameSection); - delete static_cast(DwarfAbbrevSection); - delete static_cast(DwarfInfoSection); - delete static_cast(DwarfLineSection); - delete static_cast(DwarfFrameSection); - delete static_cast(DwarfPubTypesSection); - delete static_cast(DwarfDebugInlineSection); - delete static_cast(DwarfStrSection); - delete static_cast(DwarfLocSection); - delete static_cast(DwarfARangesSection); - delete static_cast(DwarfRangesSection); - delete static_cast(DwarfMacinfoSection); -} - -MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( - const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - return getDataSection(); -} +//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that NVPTX uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/NVPTXBaseInfo.h" +#include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXSection.h" +#include "NVPTXSubtarget.h" +#include "NVPTXTargetMachine.h" +#include "NVPTXTargetObjectFile.h" +#include "NVPTXUtilities.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#undef DEBUG_TYPE +#define DEBUG_TYPE "nvptx-lower" + +using namespace llvm; + +static unsigned int uniqueCallSite = 0; + +static cl::opt sched4reg( + "nvptx-sched4reg", + cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); + +static cl::opt +FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, + cl::desc("NVPTX Specific: FMA contraction (0: don't do it" + " 1: do it 2: do it aggressively"), + cl::init(2)); + +static cl::opt UsePrecDivF32( + 
"nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, + cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" + " IEEE Compliant F32 div.rnd if available."), + cl::init(2)); + +static cl::opt UsePrecSqrtF32( + "nvptx-prec-sqrtf32", cl::Hidden, + cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), + cl::init(true)); + +static cl::opt FtzEnabled( + "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden, + cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."), + cl::init(false)); + +int NVPTXTargetLowering::getDivF32Level() const { + if (UsePrecDivF32.getNumOccurrences() > 0) { + // If nvptx-prec-div32=N is used on the command-line, always honor it + return UsePrecDivF32; + } else { + // Otherwise, use div.approx if fast math is enabled + if (getTargetMachine().Options.UnsafeFPMath) + return 0; + else + return 2; + } +} + +bool NVPTXTargetLowering::usePrecSqrtF32() const { + if (UsePrecSqrtF32.getNumOccurrences() > 0) { + // If nvptx-prec-sqrtf32 is used on the command-line, always honor it + return UsePrecSqrtF32; + } else { + // Otherwise, use sqrt.approx if fast math is enabled + return !getTargetMachine().Options.UnsafeFPMath; + } +} + +bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { + // TODO: Get rid of this flag; there can be only one way to do this. + if (FtzEnabled.getNumOccurrences() > 0) { + // If nvptx-f32ftz is used on the command-line, always honor it + return FtzEnabled; + } else { + const Function *F = MF.getFunction(); + // Otherwise, check for an nvptx-f32ftz attribute on the function + if (F->hasFnAttribute("nvptx-f32ftz")) + return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true"; + else + return false; + } +} + +static bool IsPTXVectorType(MVT VT) { + switch (VT.SimpleTy) { + default: + return false; + case MVT::v2i1: + case MVT::v4i1: + case MVT::v2i8: + case MVT::v4i8: + case MVT::v2i16: + case MVT::v4i16: + case MVT::v2i32: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v2f16: + case MVT::v4f16: + case MVT::v8f16: // <4 x f16x2> + case MVT::v2f32: + case MVT::v4f32: + case MVT::v2f64: + return true; + } +} + +/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive +/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors +/// into their primitive components. +/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the +/// same number of types as the Ins/Outs arrays in LowerFormalArguments, +/// LowerCall, and LowerReturn. +static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, + Type *Ty, SmallVectorImpl &ValueVTs, + SmallVectorImpl *Offsets = nullptr, + uint64_t StartingOffset = 0) { + SmallVector TempVTs; + SmallVector TempOffsets; + + ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); + for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { + EVT VT = TempVTs[i]; + uint64_t Off = TempOffsets[i]; + // Split vectors into individual elements, except for v2f16, which + // we will pass as a single scalar. + if (VT.isVector()) { + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + // Vectors with an even number of f16 elements will be passed to + // us as an array of v2f16 elements. We must match this so we + // stay in sync with Ins/Outs. 
+ if (EltVT == MVT::f16 && NumElts % 2 == 0) { + EltVT = MVT::v2f16; + NumElts /= 2; + } + for (unsigned j = 0; j != NumElts; ++j) { + ValueVTs.push_back(EltVT); + if (Offsets) + Offsets->push_back(Off + j * EltVT.getStoreSize()); + } + } else { + ValueVTs.push_back(VT); + if (Offsets) + Offsets->push_back(Off); + } + } +} + +// Check whether we can merge loads/stores of some of the pieces of a +// flattened function parameter or return value into a single vector +// load/store. +// +// The flattened parameter is represented as a list of EVTs and +// offsets, and the whole structure is aligned to ParamAlignment. This +// function determines whether we can load/store pieces of the +// parameter starting at index Idx using a single vectorized op of +// size AccessSize. If so, it returns the number of param pieces +// covered by the vector op. Otherwise, it returns 1. +static unsigned CanMergeParamLoadStoresStartingAt( + unsigned Idx, uint32_t AccessSize, const SmallVectorImpl &ValueVTs, + const SmallVectorImpl &Offsets, unsigned ParamAlignment) { + assert(isPowerOf2_32(AccessSize) && "must be a power of 2!"); + + // Can't vectorize if param alignment is not sufficient. + if (AccessSize > ParamAlignment) + return 1; + // Can't vectorize if offset is not aligned. + if (Offsets[Idx] & (AccessSize - 1)) + return 1; + + EVT EltVT = ValueVTs[Idx]; + unsigned EltSize = EltVT.getStoreSize(); + + // Element is too large to vectorize. + if (EltSize >= AccessSize) + return 1; + + unsigned NumElts = AccessSize / EltSize; + // Can't vectorize if AccessBytes if not a multiple of EltSize. + if (AccessSize != EltSize * NumElts) + return 1; + + // We don't have enough elements to vectorize. + if (Idx + NumElts > ValueVTs.size()) + return 1; + + // PTX ISA can only deal with 2- and 4-element vector ops. + if (NumElts != 4 && NumElts != 2) + return 1; + + for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { + // Types do not match. + if (ValueVTs[j] != EltVT) + return 1; + + // Elements are not contiguous. + if (Offsets[j] - Offsets[j - 1] != EltSize) + return 1; + } + // OK. We can vectorize ValueVTs[i..i+NumElts) + return NumElts; +} + +// Flags for tracking per-element vectorization state of loads/stores +// of a flattened function parameter or return value. +enum ParamVectorizationFlags { + PVF_INNER = 0x0, // Middle elements of a vector. + PVF_FIRST = 0x1, // First element of the vector. + PVF_LAST = 0x2, // Last element of the vector. + // Scalar is effectively a 1-element vector. + PVF_SCALAR = PVF_FIRST | PVF_LAST +}; + +// Computes whether and how we can vectorize the loads/stores of a +// flattened function parameter or return value. +// +// The flattened parameter is represented as the list of ValueVTs and +// Offsets, and is aligned to ParamAlignment bytes. We return a vector +// of the same size as ValueVTs indicating how each piece should be +// loaded/stored (i.e. as a scalar, or as part of a vector +// load/store). +static SmallVector +VectorizePTXValueVTs(const SmallVectorImpl &ValueVTs, + const SmallVectorImpl &Offsets, + unsigned ParamAlignment) { + // Set vector size to match ValueVTs and mark all elements as + // scalars by default. + SmallVector VectorInfo; + VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); + + // Check what we can vectorize using 128/64/32-bit accesses. + for (int I = 0, E = ValueVTs.size(); I != E; ++I) { + // Skip elements we've already processed. 
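+    // (For instance, four contiguous f32 pieces whose starting offset is 16-byte aligned are marked PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST and later emitted as a single v4 access.)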
+ assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); + for (unsigned AccessSize : {16, 8, 4, 2}) { + unsigned NumElts = CanMergeParamLoadStoresStartingAt( + I, AccessSize, ValueVTs, Offsets, ParamAlignment); + // Mark vectorized elements. + switch (NumElts) { + default: + llvm_unreachable("Unexpected return value"); + case 1: + // Can't vectorize using this size, try next smaller size. + continue; + case 2: + assert(I + 1 < E && "Not enough elements."); + VectorInfo[I] = PVF_FIRST; + VectorInfo[I + 1] = PVF_LAST; + I += 1; + break; + case 4: + assert(I + 3 < E && "Not enough elements."); + VectorInfo[I] = PVF_FIRST; + VectorInfo[I + 1] = PVF_INNER; + VectorInfo[I + 2] = PVF_INNER; + VectorInfo[I + 3] = PVF_LAST; + I += 3; + break; + } + // Break out of the inner loop because we've already succeeded + // using largest possible AccessSize. + break; + } + } + return VectorInfo; +} + +// NVPTXTargetLowering Constructor. +NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, + const NVPTXSubtarget &STI) + : TargetLowering(TM), nvTM(&TM), STI(STI) { + // always lower memset, memcpy, and memmove intrinsics to load/store + // instructions, rather + // then generating calls to memset, mempcy or memmove. + MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Jump is Expensive. Don't create extra control flow for 'and', 'or' + // condition branches. + setJumpIsExpensive(true); + + // Wide divides are _very_ slow. Try to reduce the width of the divide if + // possible. + addBypassSlowDiv(64, 32); + + // By default, use the Source scheduling + if (sched4reg) + setSchedulingPreference(Sched::RegPressure); + else + setSchedulingPreference(Sched::Source); + + auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, + LegalizeAction NoF16Action) { + setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); + }; + + addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); + addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); + addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); + addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); + addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); + addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); + addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass); + addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass); + + // Conversion to/from FP16/FP16x2 is always legal. + setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + + setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); + setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); + + // Operations not directly supported by NVPTX. 
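+  // Marking a node Expand lets the generic legalizer rewrite it in terms of nodes we do support, e.g. SELECT_CC becomes a SETCC feeding a SELECT.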
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v2f16, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i8, Expand); + setOperationAction(ISD::BR_CC, MVT::i16, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + // Some SIGN_EXTEND_INREG can be done using cvt instruction. + // For others we will expand to a SHL/SRA pair. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); + setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); + + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + + if (STI.hasROT64()) { + setOperationAction(ISD::ROTL, MVT::i64, Legal); + setOperationAction(ISD::ROTR, MVT::i64, Legal); + } else { + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + } + if (STI.hasROT32()) { + setOperationAction(ISD::ROTL, MVT::i32, Legal); + setOperationAction(ISD::ROTR, MVT::i32, Legal); + } else { + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + } + + setOperationAction(ISD::ROTL, MVT::i16, Expand); + setOperationAction(ISD::ROTR, MVT::i16, Expand); + setOperationAction(ISD::ROTL, MVT::i8, Expand); + setOperationAction(ISD::ROTR, MVT::i8, Expand); + setOperationAction(ISD::BSWAP, MVT::i16, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64, Expand); + + // Indirect branch is not supported. + // This also disables Jump Table creation. + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + + // We want to legalize constant related memmove and memcopy + // intrinsics. 
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + + // Turn FP extload into load/fpextend + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); + // Turn FP truncstore into trunc + store. + // FIXME: vector types should also be expanded + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // PTX does not support load / store predicate registers + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::i1, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setTruncStoreAction(VT, MVT::i1, Expand); + } + + // This is legal in NVPTX + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + + // TRAP can be lowered to PTX trap + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + + // Register custom handling for vector loads/stores + for (MVT VT : MVT::vector_valuetypes()) { + if (IsPTXVectorType(VT)) { + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); + } + } + + // Custom handling for i8 intrinsics + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + + for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { + setOperationAction(ISD::ABS, Ty, Legal); + setOperationAction(ISD::SMIN, Ty, Legal); + setOperationAction(ISD::SMAX, Ty, Legal); + setOperationAction(ISD::UMIN, Ty, Legal); + setOperationAction(ISD::UMAX, Ty, Legal); + + setOperationAction(ISD::CTPOP, Ty, Legal); + setOperationAction(ISD::CTLZ, Ty, Legal); + } + + setOperationAction(ISD::CTTZ, MVT::i16, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i64, Expand); + + // PTX does not directly support SELP of i1, so promote to i32 first + setOperationAction(ISD::SELECT, MVT::i1, Custom); + + // PTX cannot multiply two i64s in a single instruction. + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + + // We have some custom DAG combine patterns for these nodes + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SREM); + setTargetDAGCombine(ISD::UREM); + + // setcc for f16x2 needs special handling to prevent legalizer's + // attempt to scalarize it due to v2i1 not being legal. + if (STI.allowFP16Math()) + setTargetDAGCombine(ISD::SETCC); + + // Promote fp16 arithmetic if fp16 hardware isn't available or the + // user passed --nvptx-no-fp16-math. 
The flag is useful because, + // although sm_53+ GPUs have some sort of FP16 support in + // hardware, only sm_53 and sm_60 have full implementation. Others + // only have token amount of hardware and are likely to run faster + // by using fp32 units instead. + for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { + setFP16OperationAction(Op, MVT::f16, Legal, Promote); + setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); + } + + // There's no neg.f16 instruction. Expand to (0-x). + setOperationAction(ISD::FNEG, MVT::f16, Expand); + setOperationAction(ISD::FNEG, MVT::v2f16, Expand); + + // (would be) Library functions. + + // These map to conversion instructions for scalar FP types. + for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, + ISD::FROUND, ISD::FTRUNC}) { + setOperationAction(Op, MVT::f16, Legal); + setOperationAction(Op, MVT::f32, Legal); + setOperationAction(Op, MVT::f64, Legal); + setOperationAction(Op, MVT::v2f16, Expand); + } + + // 'Expand' implements FCOPYSIGN without calling an external library. + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + + // These map to corresponding instructions for f32/f64. f16 must be + // promoted to f32. v2f16 is expanded to f16, which is then promoted + // to f32. + for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, + ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { + setOperationAction(Op, MVT::f16, Promote); + setOperationAction(Op, MVT::f32, Legal); + setOperationAction(Op, MVT::f64, Legal); + setOperationAction(Op, MVT::v2f16, Expand); + } + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); + + // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. + // No FPOW or FREM in PTX. 
+ + // Now deduce the information based on the above mentioned + // actions + computeRegisterProperties(STI.getRegisterInfo()); +} + +const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((NVPTXISD::NodeType)Opcode) { + case NVPTXISD::FIRST_NUMBER: + break; + case NVPTXISD::CALL: + return "NVPTXISD::CALL"; + case NVPTXISD::RET_FLAG: + return "NVPTXISD::RET_FLAG"; + case NVPTXISD::LOAD_PARAM: + return "NVPTXISD::LOAD_PARAM"; + case NVPTXISD::Wrapper: + return "NVPTXISD::Wrapper"; + case NVPTXISD::DeclareParam: + return "NVPTXISD::DeclareParam"; + case NVPTXISD::DeclareScalarParam: + return "NVPTXISD::DeclareScalarParam"; + case NVPTXISD::DeclareRet: + return "NVPTXISD::DeclareRet"; + case NVPTXISD::DeclareScalarRet: + return "NVPTXISD::DeclareScalarRet"; + case NVPTXISD::DeclareRetParam: + return "NVPTXISD::DeclareRetParam"; + case NVPTXISD::PrintCall: + return "NVPTXISD::PrintCall"; + case NVPTXISD::PrintConvergentCall: + return "NVPTXISD::PrintConvergentCall"; + case NVPTXISD::PrintCallUni: + return "NVPTXISD::PrintCallUni"; + case NVPTXISD::PrintConvergentCallUni: + return "NVPTXISD::PrintConvergentCallUni"; + case NVPTXISD::LoadParam: + return "NVPTXISD::LoadParam"; + case NVPTXISD::LoadParamV2: + return "NVPTXISD::LoadParamV2"; + case NVPTXISD::LoadParamV4: + return "NVPTXISD::LoadParamV4"; + case NVPTXISD::StoreParam: + return "NVPTXISD::StoreParam"; + case NVPTXISD::StoreParamV2: + return "NVPTXISD::StoreParamV2"; + case NVPTXISD::StoreParamV4: + return "NVPTXISD::StoreParamV4"; + case NVPTXISD::StoreParamS32: + return "NVPTXISD::StoreParamS32"; + case NVPTXISD::StoreParamU32: + return "NVPTXISD::StoreParamU32"; + case NVPTXISD::CallArgBegin: + return "NVPTXISD::CallArgBegin"; + case NVPTXISD::CallArg: + return "NVPTXISD::CallArg"; + case NVPTXISD::LastCallArg: + return "NVPTXISD::LastCallArg"; + case NVPTXISD::CallArgEnd: + return "NVPTXISD::CallArgEnd"; + case NVPTXISD::CallVoid: + return "NVPTXISD::CallVoid"; + case NVPTXISD::CallVal: + return "NVPTXISD::CallVal"; + case NVPTXISD::CallSymbol: + return "NVPTXISD::CallSymbol"; + case NVPTXISD::Prototype: + return "NVPTXISD::Prototype"; + case NVPTXISD::MoveParam: + return "NVPTXISD::MoveParam"; + case NVPTXISD::StoreRetval: + return "NVPTXISD::StoreRetval"; + case NVPTXISD::StoreRetvalV2: + return "NVPTXISD::StoreRetvalV2"; + case NVPTXISD::StoreRetvalV4: + return "NVPTXISD::StoreRetvalV4"; + case NVPTXISD::PseudoUseParam: + return "NVPTXISD::PseudoUseParam"; + case NVPTXISD::RETURN: + return "NVPTXISD::RETURN"; + case NVPTXISD::CallSeqBegin: + return "NVPTXISD::CallSeqBegin"; + case NVPTXISD::CallSeqEnd: + return "NVPTXISD::CallSeqEnd"; + case NVPTXISD::CallPrototype: + return "NVPTXISD::CallPrototype"; + case NVPTXISD::LoadV2: + return "NVPTXISD::LoadV2"; + case NVPTXISD::LoadV4: + return "NVPTXISD::LoadV4"; + case NVPTXISD::LDGV2: + return "NVPTXISD::LDGV2"; + case NVPTXISD::LDGV4: + return "NVPTXISD::LDGV4"; + case NVPTXISD::LDUV2: + return "NVPTXISD::LDUV2"; + case NVPTXISD::LDUV4: + return "NVPTXISD::LDUV4"; + case NVPTXISD::StoreV2: + return "NVPTXISD::StoreV2"; + case NVPTXISD::StoreV4: + return "NVPTXISD::StoreV4"; + case NVPTXISD::FUN_SHFL_CLAMP: + return "NVPTXISD::FUN_SHFL_CLAMP"; + case NVPTXISD::FUN_SHFR_CLAMP: + return "NVPTXISD::FUN_SHFR_CLAMP"; + case NVPTXISD::IMAD: + return "NVPTXISD::IMAD"; + case NVPTXISD::SETP_F16X2: + return "NVPTXISD::SETP_F16X2"; + case NVPTXISD::Dummy: + return "NVPTXISD::Dummy"; + case NVPTXISD::MUL_WIDE_SIGNED: + return "NVPTXISD::MUL_WIDE_SIGNED"; + case 
NVPTXISD::MUL_WIDE_UNSIGNED: + return "NVPTXISD::MUL_WIDE_UNSIGNED"; + case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; + case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; + case NVPTXISD::Tex1DFloatFloatLevel: + return "NVPTXISD::Tex1DFloatFloatLevel"; + case NVPTXISD::Tex1DFloatFloatGrad: + return "NVPTXISD::Tex1DFloatFloatGrad"; + case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; + case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; + case NVPTXISD::Tex1DS32FloatLevel: + return "NVPTXISD::Tex1DS32FloatLevel"; + case NVPTXISD::Tex1DS32FloatGrad: + return "NVPTXISD::Tex1DS32FloatGrad"; + case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; + case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; + case NVPTXISD::Tex1DU32FloatLevel: + return "NVPTXISD::Tex1DU32FloatLevel"; + case NVPTXISD::Tex1DU32FloatGrad: + return "NVPTXISD::Tex1DU32FloatGrad"; + case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; + case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; + case NVPTXISD::Tex1DArrayFloatFloatLevel: + return "NVPTXISD::Tex1DArrayFloatFloatLevel"; + case NVPTXISD::Tex1DArrayFloatFloatGrad: + return "NVPTXISD::Tex1DArrayFloatFloatGrad"; + case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; + case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; + case NVPTXISD::Tex1DArrayS32FloatLevel: + return "NVPTXISD::Tex1DArrayS32FloatLevel"; + case NVPTXISD::Tex1DArrayS32FloatGrad: + return "NVPTXISD::Tex1DArrayS32FloatGrad"; + case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; + case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; + case NVPTXISD::Tex1DArrayU32FloatLevel: + return "NVPTXISD::Tex1DArrayU32FloatLevel"; + case NVPTXISD::Tex1DArrayU32FloatGrad: + return "NVPTXISD::Tex1DArrayU32FloatGrad"; + case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; + case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; + case NVPTXISD::Tex2DFloatFloatLevel: + return "NVPTXISD::Tex2DFloatFloatLevel"; + case NVPTXISD::Tex2DFloatFloatGrad: + return "NVPTXISD::Tex2DFloatFloatGrad"; + case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; + case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; + case NVPTXISD::Tex2DS32FloatLevel: + return "NVPTXISD::Tex2DS32FloatLevel"; + case NVPTXISD::Tex2DS32FloatGrad: + return "NVPTXISD::Tex2DS32FloatGrad"; + case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; + case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; + case NVPTXISD::Tex2DU32FloatLevel: + return "NVPTXISD::Tex2DU32FloatLevel"; + case NVPTXISD::Tex2DU32FloatGrad: + return "NVPTXISD::Tex2DU32FloatGrad"; + case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; + case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; + case NVPTXISD::Tex2DArrayFloatFloatLevel: + return "NVPTXISD::Tex2DArrayFloatFloatLevel"; + case NVPTXISD::Tex2DArrayFloatFloatGrad: + return "NVPTXISD::Tex2DArrayFloatFloatGrad"; + case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; + case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; + case NVPTXISD::Tex2DArrayS32FloatLevel: + return "NVPTXISD::Tex2DArrayS32FloatLevel"; + case NVPTXISD::Tex2DArrayS32FloatGrad: + return "NVPTXISD::Tex2DArrayS32FloatGrad"; + case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; + case NVPTXISD::Tex2DArrayU32Float: return 
"NVPTXISD::Tex2DArrayU32Float"; + case NVPTXISD::Tex2DArrayU32FloatLevel: + return "NVPTXISD::Tex2DArrayU32FloatLevel"; + case NVPTXISD::Tex2DArrayU32FloatGrad: + return "NVPTXISD::Tex2DArrayU32FloatGrad"; + case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; + case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; + case NVPTXISD::Tex3DFloatFloatLevel: + return "NVPTXISD::Tex3DFloatFloatLevel"; + case NVPTXISD::Tex3DFloatFloatGrad: + return "NVPTXISD::Tex3DFloatFloatGrad"; + case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; + case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; + case NVPTXISD::Tex3DS32FloatLevel: + return "NVPTXISD::Tex3DS32FloatLevel"; + case NVPTXISD::Tex3DS32FloatGrad: + return "NVPTXISD::Tex3DS32FloatGrad"; + case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; + case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; + case NVPTXISD::Tex3DU32FloatLevel: + return "NVPTXISD::Tex3DU32FloatLevel"; + case NVPTXISD::Tex3DU32FloatGrad: + return "NVPTXISD::Tex3DU32FloatGrad"; + case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; + case NVPTXISD::TexCubeFloatFloatLevel: + return "NVPTXISD::TexCubeFloatFloatLevel"; + case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; + case NVPTXISD::TexCubeS32FloatLevel: + return "NVPTXISD::TexCubeS32FloatLevel"; + case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; + case NVPTXISD::TexCubeU32FloatLevel: + return "NVPTXISD::TexCubeU32FloatLevel"; + case NVPTXISD::TexCubeArrayFloatFloat: + return "NVPTXISD::TexCubeArrayFloatFloat"; + case NVPTXISD::TexCubeArrayFloatFloatLevel: + return "NVPTXISD::TexCubeArrayFloatFloatLevel"; + case NVPTXISD::TexCubeArrayS32Float: + return "NVPTXISD::TexCubeArrayS32Float"; + case NVPTXISD::TexCubeArrayS32FloatLevel: + return "NVPTXISD::TexCubeArrayS32FloatLevel"; + case NVPTXISD::TexCubeArrayU32Float: + return "NVPTXISD::TexCubeArrayU32Float"; + case NVPTXISD::TexCubeArrayU32FloatLevel: + return "NVPTXISD::TexCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4R2DFloatFloat: + return "NVPTXISD::Tld4R2DFloatFloat"; + case NVPTXISD::Tld4G2DFloatFloat: + return "NVPTXISD::Tld4G2DFloatFloat"; + case NVPTXISD::Tld4B2DFloatFloat: + return "NVPTXISD::Tld4B2DFloatFloat"; + case NVPTXISD::Tld4A2DFloatFloat: + return "NVPTXISD::Tld4A2DFloatFloat"; + case NVPTXISD::Tld4R2DS64Float: + return "NVPTXISD::Tld4R2DS64Float"; + case NVPTXISD::Tld4G2DS64Float: + return "NVPTXISD::Tld4G2DS64Float"; + case NVPTXISD::Tld4B2DS64Float: + return "NVPTXISD::Tld4B2DS64Float"; + case NVPTXISD::Tld4A2DS64Float: + return "NVPTXISD::Tld4A2DS64Float"; + case NVPTXISD::Tld4R2DU64Float: + return "NVPTXISD::Tld4R2DU64Float"; + case NVPTXISD::Tld4G2DU64Float: + return "NVPTXISD::Tld4G2DU64Float"; + case NVPTXISD::Tld4B2DU64Float: + return "NVPTXISD::Tld4B2DU64Float"; + case NVPTXISD::Tld4A2DU64Float: + return "NVPTXISD::Tld4A2DU64Float"; + + case NVPTXISD::TexUnified1DFloatS32: + return "NVPTXISD::TexUnified1DFloatS32"; + case NVPTXISD::TexUnified1DFloatFloat: + return "NVPTXISD::TexUnified1DFloatFloat"; + case NVPTXISD::TexUnified1DFloatFloatLevel: + return "NVPTXISD::TexUnified1DFloatFloatLevel"; + case NVPTXISD::TexUnified1DFloatFloatGrad: + return "NVPTXISD::TexUnified1DFloatFloatGrad"; + case NVPTXISD::TexUnified1DS32S32: + return "NVPTXISD::TexUnified1DS32S32"; + case NVPTXISD::TexUnified1DS32Float: + return "NVPTXISD::TexUnified1DS32Float"; + case NVPTXISD::TexUnified1DS32FloatLevel: + return "NVPTXISD::TexUnified1DS32FloatLevel"; 
+ case NVPTXISD::TexUnified1DS32FloatGrad: + return "NVPTXISD::TexUnified1DS32FloatGrad"; + case NVPTXISD::TexUnified1DU32S32: + return "NVPTXISD::TexUnified1DU32S32"; + case NVPTXISD::TexUnified1DU32Float: + return "NVPTXISD::TexUnified1DU32Float"; + case NVPTXISD::TexUnified1DU32FloatLevel: + return "NVPTXISD::TexUnified1DU32FloatLevel"; + case NVPTXISD::TexUnified1DU32FloatGrad: + return "NVPTXISD::TexUnified1DU32FloatGrad"; + case NVPTXISD::TexUnified1DArrayFloatS32: + return "NVPTXISD::TexUnified1DArrayFloatS32"; + case NVPTXISD::TexUnified1DArrayFloatFloat: + return "NVPTXISD::TexUnified1DArrayFloatFloat"; + case NVPTXISD::TexUnified1DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified1DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified1DArrayS32S32: + return "NVPTXISD::TexUnified1DArrayS32S32"; + case NVPTXISD::TexUnified1DArrayS32Float: + return "NVPTXISD::TexUnified1DArrayS32Float"; + case NVPTXISD::TexUnified1DArrayS32FloatLevel: + return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; + case NVPTXISD::TexUnified1DArrayS32FloatGrad: + return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; + case NVPTXISD::TexUnified1DArrayU32S32: + return "NVPTXISD::TexUnified1DArrayU32S32"; + case NVPTXISD::TexUnified1DArrayU32Float: + return "NVPTXISD::TexUnified1DArrayU32Float"; + case NVPTXISD::TexUnified1DArrayU32FloatLevel: + return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; + case NVPTXISD::TexUnified1DArrayU32FloatGrad: + return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; + case NVPTXISD::TexUnified2DFloatS32: + return "NVPTXISD::TexUnified2DFloatS32"; + case NVPTXISD::TexUnified2DFloatFloat: + return "NVPTXISD::TexUnified2DFloatFloat"; + case NVPTXISD::TexUnified2DFloatFloatLevel: + return "NVPTXISD::TexUnified2DFloatFloatLevel"; + case NVPTXISD::TexUnified2DFloatFloatGrad: + return "NVPTXISD::TexUnified2DFloatFloatGrad"; + case NVPTXISD::TexUnified2DS32S32: + return "NVPTXISD::TexUnified2DS32S32"; + case NVPTXISD::TexUnified2DS32Float: + return "NVPTXISD::TexUnified2DS32Float"; + case NVPTXISD::TexUnified2DS32FloatLevel: + return "NVPTXISD::TexUnified2DS32FloatLevel"; + case NVPTXISD::TexUnified2DS32FloatGrad: + return "NVPTXISD::TexUnified2DS32FloatGrad"; + case NVPTXISD::TexUnified2DU32S32: + return "NVPTXISD::TexUnified2DU32S32"; + case NVPTXISD::TexUnified2DU32Float: + return "NVPTXISD::TexUnified2DU32Float"; + case NVPTXISD::TexUnified2DU32FloatLevel: + return "NVPTXISD::TexUnified2DU32FloatLevel"; + case NVPTXISD::TexUnified2DU32FloatGrad: + return "NVPTXISD::TexUnified2DU32FloatGrad"; + case NVPTXISD::TexUnified2DArrayFloatS32: + return "NVPTXISD::TexUnified2DArrayFloatS32"; + case NVPTXISD::TexUnified2DArrayFloatFloat: + return "NVPTXISD::TexUnified2DArrayFloatFloat"; + case NVPTXISD::TexUnified2DArrayFloatFloatLevel: + return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; + case NVPTXISD::TexUnified2DArrayFloatFloatGrad: + return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; + case NVPTXISD::TexUnified2DArrayS32S32: + return "NVPTXISD::TexUnified2DArrayS32S32"; + case NVPTXISD::TexUnified2DArrayS32Float: + return "NVPTXISD::TexUnified2DArrayS32Float"; + case NVPTXISD::TexUnified2DArrayS32FloatLevel: + return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; + case NVPTXISD::TexUnified2DArrayS32FloatGrad: + return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; + case NVPTXISD::TexUnified2DArrayU32S32: + return "NVPTXISD::TexUnified2DArrayU32S32"; + case NVPTXISD::TexUnified2DArrayU32Float: + 
return "NVPTXISD::TexUnified2DArrayU32Float"; + case NVPTXISD::TexUnified2DArrayU32FloatLevel: + return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; + case NVPTXISD::TexUnified2DArrayU32FloatGrad: + return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; + case NVPTXISD::TexUnified3DFloatS32: + return "NVPTXISD::TexUnified3DFloatS32"; + case NVPTXISD::TexUnified3DFloatFloat: + return "NVPTXISD::TexUnified3DFloatFloat"; + case NVPTXISD::TexUnified3DFloatFloatLevel: + return "NVPTXISD::TexUnified3DFloatFloatLevel"; + case NVPTXISD::TexUnified3DFloatFloatGrad: + return "NVPTXISD::TexUnified3DFloatFloatGrad"; + case NVPTXISD::TexUnified3DS32S32: + return "NVPTXISD::TexUnified3DS32S32"; + case NVPTXISD::TexUnified3DS32Float: + return "NVPTXISD::TexUnified3DS32Float"; + case NVPTXISD::TexUnified3DS32FloatLevel: + return "NVPTXISD::TexUnified3DS32FloatLevel"; + case NVPTXISD::TexUnified3DS32FloatGrad: + return "NVPTXISD::TexUnified3DS32FloatGrad"; + case NVPTXISD::TexUnified3DU32S32: + return "NVPTXISD::TexUnified3DU32S32"; + case NVPTXISD::TexUnified3DU32Float: + return "NVPTXISD::TexUnified3DU32Float"; + case NVPTXISD::TexUnified3DU32FloatLevel: + return "NVPTXISD::TexUnified3DU32FloatLevel"; + case NVPTXISD::TexUnified3DU32FloatGrad: + return "NVPTXISD::TexUnified3DU32FloatGrad"; + case NVPTXISD::TexUnifiedCubeFloatFloat: + return "NVPTXISD::TexUnifiedCubeFloatFloat"; + case NVPTXISD::TexUnifiedCubeFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeS32Float: + return "NVPTXISD::TexUnifiedCubeS32Float"; + case NVPTXISD::TexUnifiedCubeS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeU32Float: + return "NVPTXISD::TexUnifiedCubeU32Float"; + case NVPTXISD::TexUnifiedCubeU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloat: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; + case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayS32Float: + return "NVPTXISD::TexUnifiedCubeArrayS32Float"; + case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; + case NVPTXISD::TexUnifiedCubeArrayU32Float: + return "NVPTXISD::TexUnifiedCubeArrayU32Float"; + case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: + return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; + case NVPTXISD::Tld4UnifiedR2DFloatFloat: + return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; + case NVPTXISD::Tld4UnifiedG2DFloatFloat: + return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; + case NVPTXISD::Tld4UnifiedB2DFloatFloat: + return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; + case NVPTXISD::Tld4UnifiedA2DFloatFloat: + return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; + case NVPTXISD::Tld4UnifiedR2DS64Float: + return "NVPTXISD::Tld4UnifiedR2DS64Float"; + case NVPTXISD::Tld4UnifiedG2DS64Float: + return "NVPTXISD::Tld4UnifiedG2DS64Float"; + case NVPTXISD::Tld4UnifiedB2DS64Float: + return "NVPTXISD::Tld4UnifiedB2DS64Float"; + case NVPTXISD::Tld4UnifiedA2DS64Float: + return "NVPTXISD::Tld4UnifiedA2DS64Float"; + case NVPTXISD::Tld4UnifiedR2DU64Float: + return "NVPTXISD::Tld4UnifiedR2DU64Float"; + case NVPTXISD::Tld4UnifiedG2DU64Float: + return "NVPTXISD::Tld4UnifiedG2DU64Float"; + case NVPTXISD::Tld4UnifiedB2DU64Float: + return "NVPTXISD::Tld4UnifiedB2DU64Float"; + case NVPTXISD::Tld4UnifiedA2DU64Float: + return "NVPTXISD::Tld4UnifiedA2DU64Float"; + + case NVPTXISD::Suld1DI8Clamp: return 
"NVPTXISD::Suld1DI8Clamp"; + case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; + case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; + case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; + case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; + case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; + case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; + case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; + case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; + case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; + case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; + + case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; + case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; + case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; + case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; + case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; + case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; + case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; + case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; + case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; + case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; + case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; + + case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; + case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; + case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; + case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; + case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; + case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; + case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; + case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; + case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; + case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; + case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; + + case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; + case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; + case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; + case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; + case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; + case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; + case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; + case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; + case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; + case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; + case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; + + case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; + case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; + case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; + case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; + case 
NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; + case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; + case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; + case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; + case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; + case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; + case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; + + case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; + case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; + case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; + case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; + case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; + case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; + case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; + case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; + case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; + case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; + case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; + + case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; + case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; + case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; + case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; + case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; + case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; + case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; + case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; + case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; + case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; + case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; + + case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; + case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; + case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; + case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; + case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; + case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; + case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; + case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; + case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; + case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; + case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; + + case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; + case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; + case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; + case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; + case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; + case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; + case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; + case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; + case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; + 
case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; + case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; + + case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; + case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; + case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; + case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; + case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; + case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; + case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; + case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; + case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; + case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; + case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; + + case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; + case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; + case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; + case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; + case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; + case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; + case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; + case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; + case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; + case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; + case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; + + case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; + case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; + case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; + case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; + case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; + case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; + case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; + case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; + case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; + case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; + case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; + + case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; + case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; + case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; + case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; + case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; + case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; + case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; + case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; + case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; + case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; + case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; + + case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; + case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; + case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; + case NVPTXISD::Suld2DArrayI64Zero: return 
"NVPTXISD::Suld2DArrayI64Zero"; + case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; + case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; + case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; + case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; + case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; + case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; + case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; + + case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; + case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; + case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; + case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; + case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; + case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; + case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; + case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; + case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; + case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; + case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; + } + return nullptr; +} + +TargetLoweringBase::LegalizeTypeAction +NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) + return TypeSplitVector; + if (VT == MVT::v2f16) + return TypeLegal; + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, + int Enabled, int &ExtraSteps, + bool &UseOneConst, + bool Reciprocal) const { + if (!(Enabled == ReciprocalEstimate::Enabled || + (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) + return SDValue(); + + if (ExtraSteps == ReciprocalEstimate::Unspecified) + ExtraSteps = 0; + + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + bool Ftz = useF32FTZ(DAG.getMachineFunction()); + + auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(IID, DL, MVT::i32), Operand); + }; + + // The sqrt and rsqrt refinement processes assume we always start out with an + // approximation of the rsqrt. Therefore, if we're going to do any refinement + // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing + // any refinement, we must return a regular sqrt. + if (Reciprocal || ExtraSteps > 0) { + if (VT == MVT::f32) + return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f + : Intrinsic::nvvm_rsqrt_approx_f); + else if (VT == MVT::f64) + return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); + else + return SDValue(); + } else { + if (VT == MVT::f32) + return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f + : Intrinsic::nvvm_sqrt_approx_f); + else { + // There's no sqrt.approx.f64 instruction, so we emit + // reciprocal(rsqrt(x)). This is faster than + // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain + // x * rsqrt(x).) 
+ return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), + MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); + } + } +} + +SDValue +NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + const GlobalValue *GV = cast(Op)->getGlobal(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT); + return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); +} + +std::string NVPTXTargetLowering::getPrototype( + const DataLayout &DL, Type *retTy, const ArgListTy &Args, + const SmallVectorImpl &Outs, unsigned retAlignment, + const ImmutableCallSite *CS) const { + auto PtrVT = getPointerTy(DL); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return ""; + + std::stringstream O; + O << "prototype_" << uniqueCallSite << " : .callprototype "; + + if (retTy->getTypeID() == Type::VoidTyID) { + O << "()"; + } else { + O << "("; + if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { + unsigned size = 0; + if (auto *ITy = dyn_cast(retTy)) { + size = ITy->getBitWidth(); + } else { + assert(retTy->isFloatingPointTy() && + "Floating point type expected here"); + size = retTy->getPrimitiveSizeInBits(); + } + // PTX ABI requires all scalar return values to be at least 32 + // bits in size. fp16 normally uses .b16 as its storage type in + // PTX, so its size must be adjusted here, too. + if (size < 32) + size = 32; + + O << ".param .b" << size << " _"; + } else if (isa(retTy)) { + O << ".param .b" << PtrVT.getSizeInBits() << " _"; + } else if (retTy->isAggregateType() || retTy->isVectorTy()) { + auto &DL = CS->getCalledFunction()->getParent()->getDataLayout(); + O << ".param .align " << retAlignment << " .b8 _[" + << DL.getTypeAllocSize(retTy) << "]"; + } else { + llvm_unreachable("Unknown return type"); + } + O << ") "; + } + O << "_ ("; + + bool first = true; + + unsigned OIdx = 0; + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + Type *Ty = Args[i].Ty; + if (!first) { + O << ", "; + } + first = false; + + if (!Outs[OIdx].Flags.isByVal()) { + if (Ty->isAggregateType() || Ty->isVectorTy()) { + unsigned align = 0; + const CallInst *CallI = cast(CS->getInstruction()); + // +1 because index 0 is reserved for return type alignment + if (!getAlign(*CallI, i + 1, align)) + align = DL.getABITypeAlignment(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + // update the index for Outs + SmallVector vtparts; + ComputeValueVTs(*this, DL, Ty, vtparts); + if (unsigned len = vtparts.size()) + OIdx += len - 1; + continue; + } + // i8 types in IR will be i16 types in SDAG + assert((getValueType(DL, Ty) == Outs[OIdx].VT || + (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && + "type mismatch between callee prototype and arguments"); + // scalar type + unsigned sz = 0; + if (isa(Ty)) { + sz = cast(Ty)->getBitWidth(); + if (sz < 32) + sz = 32; + } else if (isa(Ty)) { + sz = PtrVT.getSizeInBits(); + } else if (Ty->isHalfTy()) + // PTX ABI requires all scalar parameters to be at least 32 + // bits in size. fp16 normally uses .b16 as its storage type + // in PTX, so its size must be adjusted here, too. 
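+        // (Hence an i8, i16, or half scalar argument is still declared as ".param .b32 _".)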
+ sz = 32; + else + sz = Ty->getPrimitiveSizeInBits(); + O << ".param .b" << sz << " "; + O << "_"; + continue; + } + auto *PTy = dyn_cast(Ty); + assert(PTy && "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + unsigned align = Outs[OIdx].Flags.getByValAlign(); + unsigned sz = DL.getTypeAllocSize(ETy); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + } + O << ");"; + return O.str(); +} + +unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, + const ImmutableCallSite *CS, + Type *Ty, unsigned Idx, + const DataLayout &DL) const { + if (!CS) { + // CallSite is zero, fallback to ABI type alignment + return DL.getABITypeAlignment(Ty); + } + + unsigned Align = 0; + const Value *DirectCallee = CS->getCalledFunction(); + + if (!DirectCallee) { + // We don't have a direct function symbol, but that may be because of + // constant cast instructions in the call. + const Instruction *CalleeI = CS->getInstruction(); + assert(CalleeI && "Call target is not a function or derived value?"); + + // With bitcast'd call targets, the instruction will be the call + if (isa(CalleeI)) { + // Check if we have call alignment metadata + if (getAlign(*cast(CalleeI), Idx, Align)) + return Align; + + const Value *CalleeV = cast(CalleeI)->getCalledValue(); + // Ignore any bitcast instructions + while (isa(CalleeV)) { + const ConstantExpr *CE = cast(CalleeV); + if (!CE->isCast()) + break; + // Look through the bitcast + CalleeV = cast(CalleeV)->getOperand(0); + } + + // We have now looked past all of the bitcasts. Do we finally have a + // Function? + if (isa(CalleeV)) + DirectCallee = CalleeV; + } + } + + // Check for function alignment information if we found that the + // ultimate target is a Function + if (DirectCallee) + if (getAlign(*cast(DirectCallee), Idx, Align)) + return Align; + + // Call is indirect or alignment information is not available, fall back to + // the ABI type alignment + return DL.getABITypeAlignment(Ty); +} + +SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc dl = CLI.DL; + SmallVectorImpl &Outs = CLI.Outs; + SmallVectorImpl &OutVals = CLI.OutVals; + SmallVectorImpl &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + ArgListTy &Args = CLI.getArgs(); + Type *RetTy = CLI.RetTy; + ImmutableCallSite *CS = CLI.CS; + const DataLayout &DL = DAG.getDataLayout(); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + + SDValue tempChain = Chain; + Chain = DAG.getCALLSEQ_START( + Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl); + SDValue InFlag = Chain.getValue(1); + + unsigned paramCount = 0; + // Args.size() and Outs.size() need not match. + // Outs.size() will be larger + // * if there is an aggregate argument with multiple fields (each field + // showing up separately in Outs) + // * if there is a vector argument with more than typical vector-length + // elements (generally if more than 4) where each vector element is + // individually present in Outs. + // So a different index should be used for indexing into Outs/OutVals. + // See similar issue in LowerFormalArguments. 
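+  // For example, a single {float, float} aggregate argument contributes one entry to Args but two entries to Outs/OutVals, hence the separate OIdx index kept below.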
+ unsigned OIdx = 0; + // Declare the .params or .reg need to pass values + // to the function + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + EVT VT = Outs[OIdx].VT; + Type *Ty = Args[i].Ty; + + if (!Outs[OIdx].Flags.isByVal()) { + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); + unsigned ArgAlign = + getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); + unsigned AllocSize = DL.getTypeAllocSize(Ty); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + bool NeedAlign; // Does argument declaration specify alignment? + if (Ty->isAggregateType() || Ty->isVectorTy()) { + // declare .param .align .b8 .param[]; + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), + DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps); + NeedAlign = true; + } else { + // declare .param .b .param; + if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { + // PTX ABI requires integral types to be at least 32 bits in + // size. FP16 is loaded/stored using i16, so it's handled + // here as well. + AllocSize = 4; + } + SDValue DeclareScalarParamOps[] = { + Chain, DAG.getConstant(paramCount, dl, MVT::i32), + DAG.getConstant(AllocSize * 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag}; + Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, + DeclareScalarParamOps); + NeedAlign = false; + } + InFlag = Chain.getValue(1); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter + // than 32-bits are sign extended or zero extended, depending on + // whether they are signed or unsigned types. This case applies + // only to scalar parameters and not to aggregate values. + bool ExtendIntegerParam = + Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; + + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); + SmallVector StoreOperands; + for (unsigned j = 0, je = VTs.size(); j != je; ++j) { + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); + StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); + } + + EVT EltVT = VTs[j]; + SDValue StVal = OutVals[OIdx]; + if (ExtendIntegerParam) { + assert(VTs.size() == 1 && "Scalar can't have multiple parts."); + // zext/sext to i32 + StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, StVal); + } else if (EltVT.getSizeInBits() < 16) { + // Use 16-bit registers for small stores as it's the + // smallest general purpose register size supported by NVPTX. + StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); + } + + // Record the value to store. + StoreOperands.push_back(StVal); + + if (VectorInfo[j] & PVF_LAST) { + unsigned NumElts = StoreOperands.size() - 3; + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreParam; + break; + case 2: + Op = NVPTXISD::StoreParamV2; + break; + case 4: + Op = NVPTXISD::StoreParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + StoreOperands.push_back(InFlag); + + // Adjust type of the store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; + unsigned EltAlign = + NeedAlign ? 
+
+          Chain = DAG.getMemIntrinsicNode(
+              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+              TheStoreType, MachinePointerInfo(), EltAlign);
+          InFlag = Chain.getValue(1);
+
+          // Cleanup.
+          StoreOperands.clear();
+        }
+        ++OIdx;
+      }
+      assert(StoreOperands.empty() && "Unfinished parameter store.");
+      if (VTs.size() > 0)
+        --OIdx;
+      ++paramCount;
+      continue;
+    }
+
+    // ByVal arguments
+    SmallVector<EVT, 16> VTs;
+    SmallVector<uint64_t, 16> Offsets;
+    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+    assert(PTy && "Type of a byval parameter should be pointer");
+    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
+
+    // declare .param .align <align> .b8 .param<n>[<size>];
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+    // so we don't need to worry about natural alignment or not.
+    // See TargetLowering::LowerCallTo().
+
+    // Enforce minimum alignment of 4 to work around ptxas miscompile
+    // for sm_50+. See corresponding alignment adjustment in
+    // emitFunctionParamList() for details.
+    if (ArgAlign < 4)
+      ArgAlign = 4;
+    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
+    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                        DeclareParamOps);
+    InFlag = Chain.getValue(1);
+    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+      EVT elemtype = VTs[j];
+      int curOffset = Offsets[j];
+      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+      auto PtrVT = getPointerTy(DL);
+      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+                                    DAG.getConstant(curOffset, dl, PtrVT));
+      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                   MachinePointerInfo(), PartAlign);
+      if (elemtype.getSizeInBits() < 16) {
+        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain,
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(curOffset, dl, MVT::i32),
+                                 theVal, InFlag };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                      CopyParamOps, elemtype,
+                                      MachinePointerInfo());
+
+      InFlag = Chain.getValue(1);
+    }
+    ++paramCount;
+  }
+
+  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned retAlignment = 0;
+
+  // Handle Result
+  if (Ins.size() > 0) {
+    SmallVector<EVT, 16> resvtparts;
+    ComputeValueVTs(*this, DL, RetTy, resvtparts);
+
+    // Declare
+    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+    //  .param .b<size-in-bits> retval0
+    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
+    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+    // these three types to match the logic in
+    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+    // Plus, this behavior is consistent with nvcc's.
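As a hedged illustration of the two return-value declaration shapes this comment refers to; the concrete sizes below are assumed for the example and are not taken from this patch.

// Illustration only (assumed examples):
//   i32 / float / pointer return, widened to at least 32 bits:
//     .param .b32 retval0;
//   aggregate or vector return, declared as an aligned byte array:
//     .param .align 4 .b8 retval0[8];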
+ if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() || + RetTy->isPointerTy()) { + // Scalar needs to be at least 32bit wide + if (resultsz < 32) + resultsz = 32; + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(resultsz, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps); + InFlag = Chain.getValue(1); + } else { + retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, + DAG.getConstant(retAlignment, dl, MVT::i32), + DAG.getConstant(resultsz / 8, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, + DeclareRetOps); + InFlag = Chain.getValue(1); + } + } + + if (!Func) { + // This is indirect function call case : PTX requires a prototype of the + // form + // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); + // to be emitted, and the label has to used as the last arg of call + // instruction. + // The prototype is embedded in a string and put as the operand for a + // CallPrototype SDNode which will print out to the value of the string. + SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); + std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); + const char *ProtoStr = + nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); + SDValue ProtoOps[] = { + Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, + }; + Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); + InFlag = Chain.getValue(1); + } + // Op to just print "call" + SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrintCallOps[] = { + Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag + }; + // We model convergent calls as separate opcodes. + unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall; + if (CLI.IsConvergent) + Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni + : NVPTXISD::PrintConvergentCall; + Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); + InFlag = Chain.getValue(1); + + // Ops to print out the function name + SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallVoidOps[] = { Chain, Callee, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); + InFlag = Chain.getValue(1); + + // Ops to print out the param list + SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgBeginOps[] = { Chain, InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, + CallArgBeginOps); + InFlag = Chain.getValue(1); + + for (unsigned i = 0, e = paramCount; i != e; ++i) { + unsigned opcode; + if (i == (e - 1)) + opcode = NVPTXISD::LastCallArg; + else + opcode = NVPTXISD::CallArg; + SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(i, dl, MVT::i32), InFlag }; + Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); + InFlag = Chain.getValue(1); + } + SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CallArgEndOps[] = { Chain, + DAG.getConstant(Func ? 
1 : 0, dl, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); + InFlag = Chain.getValue(1); + + if (!Func) { + SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue PrototypeOps[] = { Chain, + DAG.getConstant(uniqueCallSite, dl, MVT::i32), + InFlag }; + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); + InFlag = Chain.getValue(1); + } + + // Generate loads from param memory/moves from registers for result + if (Ins.size() > 0) { + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); + + unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); + auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); + + SmallVector LoadVTs; + int VecIdx = -1; // Index of the first element of the vector. + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { + bool needTruncate = false; + EVT TheLoadType = VTs[i]; + EVT EltType = Ins[i].VT; + unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); + if (ExtendIntegerRetVal) { + TheLoadType = MVT::i32; + EltType = MVT::i32; + needTruncate = true; + } else if (TheLoadType.getSizeInBits() < 16) { + if (VTs[i].isInteger()) + needTruncate = true; + EltType = MVT::i16; + } + + // Record index of the very first element of the vector. + if (VectorInfo[i] & PVF_FIRST) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + VecIdx = i; + } + + LoadVTs.push_back(EltType); + + if (VectorInfo[i] & PVF_LAST) { + unsigned NumElts = LoadVTs.size(); + LoadVTs.push_back(MVT::Other); + LoadVTs.push_back(MVT::Glue); + NVPTXISD::NodeType Op; + switch (NumElts) { + case 1: + Op = NVPTXISD::LoadParam; + break; + case 2: + Op = NVPTXISD::LoadParamV2; + break; + case 4: + Op = NVPTXISD::LoadParamV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + SDValue LoadOperands[] = { + Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; + SDValue RetVal = DAG.getMemIntrinsicNode( + Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, + MachinePointerInfo(), EltAlign); + + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Ret = RetVal.getValue(j); + if (needTruncate) + Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret); + InVals.push_back(Ret); + } + Chain = RetVal.getValue(NumElts); + InFlag = RetVal.getValue(NumElts + 1); + + // Cleanup + VecIdx = -1; + LoadVTs.clear(); + } + } + } + + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(uniqueCallSite, dl, true), + DAG.getIntPtrConstant(uniqueCallSite + 1, dl, + true), + InFlag, dl); + uniqueCallSite++; + + // set isTailCall to false for now, until we figure out how to express + // tail call optimization in PTX + isTailCall = false; + return Chain; +} + +// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() +// (see LegalizeDAG.cpp). This is slow and uses local memory. 
+// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 +SDValue +NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + SmallVector Ops; + unsigned NumOperands = Node->getNumOperands(); + for (unsigned i = 0; i < NumOperands; ++i) { + SDValue SubOp = Node->getOperand(i); + EVT VVT = SubOp.getNode()->getValueType(0); + EVT EltVT = VVT.getVectorElementType(); + unsigned NumSubElem = VVT.getVectorNumElements(); + for (unsigned j = 0; j < NumSubElem; ++j) { + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, + DAG.getIntPtrConstant(j, dl))); + } + } + return DAG.getBuildVector(Node->getValueType(0), dl, Ops); +} + +// We can init constant f16x2 with a single .b32 move. Normally it +// would get lowered as two constant loads and vector-packing move. +// mov.b16 %h1, 0x4000; +// mov.b16 %h2, 0x3C00; +// mov.b32 %hh2, {%h2, %h1}; +// Instead we want just a constant move: +// mov.b32 %hh2, 0x40003C00 +// +// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 +// generates good SASS in both cases. +SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + //return Op; + if (!(Op->getValueType(0) == MVT::v2f16 && + isa(Op->getOperand(0)) && + isa(Op->getOperand(1)))) + return Op; + + APInt E0 = + cast(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); + APInt E1 = + cast(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); + SDValue Const = + DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); + return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); +} + +SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Index = Op->getOperand(1); + // Constant index will be matched by tablegen. + if (isa(Index.getNode())) + return Op; + + // Extract individual elements and select one of them. + SDValue Vector = Op->getOperand(0); + EVT VectorVT = Vector.getValueType(); + assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); + EVT EltVT = VectorVT.getVectorElementType(); + + SDLoc dl(Op.getNode()); + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, + DAG.getIntPtrConstant(0, dl)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, + DAG.getIntPtrConstant(1, dl)); + return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, + ISD::CondCode::SETEQ); +} + +/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + if (VTBits == 32 && STI.getSmVersion() >= 35) { + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
+ // {dHi, dLo} = {aHi, aLo} >> Amt + // dHi = aHi >> Amt + // dLo = shf.r.clamp aLo, aHi, Amt + + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + // {dHi, dLo} = {aHi, aLo} >> Amt + // - if (Amt>=size) then + // dLo = aHi >> (Amt-size) + // dHi = aHi >> Amt (this is either all 0 or all 1) + // else + // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) + // dHi = aHi >> Amt + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, dl, MVT::i32), + ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32), + ISD::SETGE); + SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +/// LowerShiftLeftParts - Lower SHL_PARTS, which +/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift +/// amount, or +/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift +/// amount. +SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + assert(Op.getOpcode() == ISD::SHL_PARTS); + + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + if (VTBits == 32 && STI.getSmVersion() >= 35) { + // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
+ // {dHi, dLo} = {aHi, aLo} << Amt + // dHi = shf.l.clamp aLo, aHi, Amt + // dLo = aLo << Amt + + SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, + ShAmt); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } + else { + // {dHi, dLo} = {aHi, aLo} << Amt + // - if (Amt>=size) then + // dLo = aLo << Amt (all 0) + // dLo = aLo << (Amt-size) + // else + // dLo = aLo << Amt + // dHi = (aHi << Amt) | (aLo >> (size-Amt)) + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(VTBits, dl, MVT::i32), + ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32)); + SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i32), + ISD::SETGE); + SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); + } +} + +SDValue +NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + case ISD::RETURNADDR: + return SDValue(); + case ISD::FRAMEADDR: + return SDValue(); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: + return Op; + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return Op; + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::CONCAT_VECTORS: + return LowerCONCAT_VECTORS(Op, DAG); + case ISD::STORE: + return LowerSTORE(Op, DAG); + case ISD::LOAD: + return LowerLOAD(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::SELECT: + return LowerSelect(Op, DAG); + default: + llvm_unreachable("Custom lowering not defined for operation"); + } +} + +SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { + SDValue Op0 = Op->getOperand(0); + SDValue Op1 = Op->getOperand(1); + SDValue Op2 = Op->getOperand(2); + SDLoc DL(Op.getNode()); + + assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); + + Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); + Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); + SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); + + return Trunc; +} + +SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::i1) + return LowerLOADi1(Op, DAG); + + // v2f16 is legal, so we can't rely on legalizer to handle unaligned + // loads and have to handle it here. 
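A hedged sketch of what the expansion below amounts to for an under-aligned v2f16 load; the PTX assumes expandUnalignedLoad scalarizes the two halves and is illustrative rather than literal output of this patch.

// Illustration only (assumed lowering):
//   a 4-byte v2f16 load that is only 2-byte aligned is split into two
//   aligned 16-bit loads whose results are re-packed into one register:
//     ld.u16  %h0, [addr];
//     ld.u16  %h1, [addr+2];
//     mov.b32 %hh0, {%h0, %h1};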
+ if (Op.getValueType() == MVT::v2f16) { + LoadSDNode *Load = cast(Op); + EVT MemVT = Load->getMemoryVT(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + Load->getAddressSpace(), Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, SDLoc(Op)); + } + } + + return SDValue(); +} + +// v = ld i1* addr +// => +// v1 = ld i8* addr (-> i16) +// v = trunc i16 to i1 +SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + LoadSDNode *LD = cast(Node); + SDLoc dl(Node); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD); + assert(Node->getValueType(0) == MVT::i1 && + "Custom lowering for i1 load only"); + SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), + LD->getPointerInfo(), LD->getAlignment(), + LD->getMemOperand()->getFlags()); + SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); + // The legalizer (the caller) is expecting two values from the legalized + // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() + // in LegalizeDAG.cpp which also uses MergeValues. + SDValue Ops[] = { result, LD->getChain() }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *Store = cast(Op); + EVT VT = Store->getMemoryVT(); + + if (VT == MVT::i1) + return LowerSTOREi1(Op, DAG); + + // v2f16 is legal, so we can't rely on legalizer to handle unaligned + // stores and have to handle it here. + if (VT == MVT::v2f16 && + !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + Store->getAddressSpace(), Store->getAlignment())) + return expandUnalignedStore(Store, DAG); + + if (VT.isVector()) + return LowerSTOREVector(Op, DAG); + + return SDValue(); +} + +SDValue +NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { + SDNode *N = Op.getNode(); + SDValue Val = N->getOperand(1); + SDLoc DL(N); + EVT ValVT = Val.getValueType(); + + if (ValVT.isVector()) { + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 stores of <2 x double> here + // but I'm leaving that as a TODO for now. + if (!ValVT.isSimple()) + return SDValue(); + switch (ValVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f16: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f16: + case MVT::v4f32: + case MVT::v8f16: // <4 x f16x2> + // This is a "native" vector type + break; + } + + MemSDNode *MemSD = cast(N); + const DataLayout &TD = DAG.getDataLayout(); + + unsigned Align = MemSD->getAlignment(); + unsigned PrefAlign = + TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This store is not sufficiently aligned, so bail out and let this vector + // store be scalarized. Note that we may still be able to emit smaller + // vector stores. For example, if we are storing a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. 
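A hedged worked example of the check described above; the preferred alignment of 16 for <4 x float> is an assumption based on a typical NVPTX data layout, not something stated in this patch.

// Illustration only (assumes PrefAlign(<4 x float>) == 16):
//   store <4 x float>, align 16  -> handled here as a single v4 store
//   store <4 x float>, align 8   -> 8 < 16, bail out below; the legalizer
//                                   retries with two <2 x float> stores,
//                                   each of which passes with PrefAlign == 8.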
+ return SDValue(); + } + + unsigned Opcode = 0; + EVT EltVT = ValVT.getVectorElementType(); + unsigned NumElts = ValVT.getVectorNumElements(); + + // Since StoreV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // stored type to i16 and propagate the "real" type as the memory type. + bool NeedExt = false; + if (EltVT.getSizeInBits() < 16) + NeedExt = true; + + bool StoreF16x2 = false; + switch (NumElts) { + default: + return SDValue(); + case 2: + Opcode = NVPTXISD::StoreV2; + break; + case 4: + Opcode = NVPTXISD::StoreV4; + break; + case 8: + // v8f16 is a special case. PTX doesn't have st.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // store them with st.v4.b32. + assert(EltVT == MVT::f16 && "Wrong type for the vector."); + Opcode = NVPTXISD::StoreV4; + StoreF16x2 = true; + break; + } + + SmallVector Ops; + + // First is the chain + Ops.push_back(N->getOperand(0)); + + if (StoreF16x2) { + // Combine f16,f16 -> v2f16 + NumElts /= 2; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2, DL)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, + DAG.getIntPtrConstant(i * 2 + 1, DL)); + SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); + Ops.push_back(V2); + } + } else { + // Then the split values + for (unsigned i = 0; i < NumElts; ++i) { + SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(i, DL)); + if (NeedExt) + ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + Ops.push_back(ExtVal); + } + } + + // Then any remaining arguments + Ops.append(N->op_begin() + 2, N->op_end()); + + SDValue NewSt = + DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + // return DCI.CombineTo(N, NewSt, true); + return NewSt; + } + + return SDValue(); +} + +// st i1 v, addr +// => +// v1 = zxt v to i16 +// st.u8 i16, addr +SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + StoreSDNode *ST = cast(Node); + SDValue Tmp1 = ST->getChain(); + SDValue Tmp2 = ST->getBasePtr(); + SDValue Tmp3 = ST->getValue(); + assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); + Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); + SDValue Result = + DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, + ST->getAlignment(), ST->getMemOperand()->getFlags()); + return Result; +} + +SDValue +NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { + std::string ParamSym; + raw_string_ostream ParamStr(ParamSym); + + ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; + ParamStr.flush(); + + std::string *SavedStr = + nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); + return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); +} + +// Check to see if the kernel argument is image*_t or sampler_t + +static bool isImageOrSamplerVal(const Value *arg, const Module *context) { + static const char *const specialTypes[] = { "struct._image2d_t", + "struct._image3d_t", + "struct._sampler_t" }; + + Type *Ty = arg->getType(); + auto *PTy = dyn_cast(Ty); + + if (!PTy) + return false; + + if (!context) + return false; + + auto *STy = dyn_cast(PTy->getElementType()); + if (!STy || STy->isLiteral()) + 
return false; + + return std::find(std::begin(specialTypes), std::end(specialTypes), + STy->getName()) != std::end(specialTypes); +} + +SDValue NVPTXTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + const DataLayout &DL = DAG.getDataLayout(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + const Function *F = MF.getFunction(); + const AttributeList &PAL = F->getAttributes(); + const TargetLowering *TLI = STI.getTargetLowering(); + + SDValue Root = DAG.getRoot(); + std::vector OutChains; + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + + std::vector argTypes; + std::vector theArgs; + for (const Argument &I : F->args()) { + theArgs.push_back(&I); + argTypes.push_back(I.getType()); + } + // argTypes.size() (or theArgs.size()) and Ins.size() need not match. + // Ins.size() will be larger + // * if there is an aggregate argument with multiple fields (each field + // showing up separately in Ins) + // * if there is a vector argument with more than typical vector-length + // elements (generally if more than 4) where each vector element is + // individually present in Ins. + // So a different index should be used for indexing into Ins. + // See similar issue in LowerCall. + unsigned InsIdx = 0; + + int idx = 0; + for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { + Type *Ty = argTypes[i]; + + // If the kernel argument is image*_t or sampler_t, convert it to + // a i32 constant holding the parameter position. This can later + // matched in the AsmPrinter to output the correct mangled name. + if (isImageOrSamplerVal( + theArgs[i], + (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() + : nullptr))) { + assert(isKernelFunction(*F) && + "Only kernels can have image/sampler params"); + InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); + continue; + } + + if (theArgs[i]->use_empty()) { + // argument is dead + if (Ty->isAggregateType()) { + SmallVector vtparts; + + ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); + assert(vtparts.size() > 0 && "empty aggregate type not expected"); + for (unsigned parti = 0, parte = vtparts.size(); parti != parte; + ++parti) { + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); + ++InsIdx; + } + if (vtparts.size() > 0) + --InsIdx; + continue; + } + if (Ty->isVectorTy()) { + EVT ObjectVT = getValueType(DL, Ty); + unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); + for (unsigned parti = 0; parti < NumRegs; ++parti) { + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); + ++InsIdx; + } + if (NumRegs > 0) + --InsIdx; + continue; + } + InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); + continue; + } + + // In the following cases, assign a node order of "idx+1" + // to newly created nodes. The SDNodes for params have to + // appear in the same order as their order of appearance + // in the original function. "idx+1" holds that order. 
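To make the parameter loading that follows concrete, here is a hedged illustration for a hypothetical kernel foo(int, float); the symbol names follow the <function>_param_<n> scheme produced by getParamSymbol above, and the PTX is illustrative rather than literal output of this patch.

// Illustration only (hypothetical kernel "foo(int a, float b)"):
//   .visible .entry foo(
//     .param .u32 foo_param_0,
//     .param .f32 foo_param_1
//   )
//   ...
//   ld.param.u32 %r1, [foo_param_0];
//   ld.param.f32 %f1, [foo_param_1];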
+ if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { + bool aggregateIsPacked = false; + if (StructType *STy = dyn_cast(Ty)) + aggregateIsPacked = STy->isPacked(); + + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); + assert(VTs.size() > 0 && "Unexpected empty type."); + auto VectorInfo = + VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); + + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + int VecIdx = -1; // Index of the first element of the current vector. + for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { + if (VectorInfo[parti] & PVF_FIRST) { + assert(VecIdx == -1 && "Orphaned vector."); + VecIdx = parti; + } + + // That's the last element of this store op. + if (VectorInfo[parti] & PVF_LAST) { + unsigned NumElts = parti - VecIdx + 1; + EVT EltVT = VTs[parti]; + // i1 is loaded/stored as i8. + EVT LoadVT = EltVT; + if (EltVT == MVT::i1) + LoadVT = MVT::i8; + else if (EltVT == MVT::v2f16) + // getLoad needs a vector type, but it can't handle + // vectors which contain v2f16 elements. So we must load + // using i32 here and then bitcast back. + LoadVT = MVT::i32; + + EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); + SDValue VecAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, Arg, + DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); + Value *srcValue = Constant::getNullValue(PointerType::get( + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + SDValue P = + DAG.getLoad(VecVT, dl, Root, VecAddr, + MachinePointerInfo(srcValue), aggregateIsPacked, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + if (P.getNode()) + P.getNode()->setIROrder(idx + 1); + for (unsigned j = 0; j < NumElts; ++j) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, + DAG.getIntPtrConstant(j, dl)); + // We've loaded i1 as an i8 and now must truncate it back to i1 + if (EltVT == MVT::i1) + Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); + // v2f16 was loaded as an i32. Now we must bitcast it back. + else if (EltVT == MVT::v2f16) + Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); + // Extend the element if necesary (e.g. an i8 is loaded + // into an i16 register) + if (Ins[InsIdx].VT.isInteger() && + Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { + unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); + } + InVals.push_back(Elt); + } + + // Reset vector tracking state. + VecIdx = -1; + } + ++InsIdx; + } + if (VTs.size() > 0) + --InsIdx; + continue; + } + + // Param has ByVal attribute + // Return MoveParam(param symbol). + // Ideally, the param symbol can be returned directly, + // but when SDNode builder decides to use it in a CopyToReg(), + // machine instruction fails because TargetExternalSymbol + // (not lowered) is target dependent, and CopyToReg assumes + // the source is lowered. + EVT ObjectVT = getValueType(DL, Ty); + assert(ObjectVT == Ins[InsIdx].VT && + "Ins type did not match function type"); + SDValue Arg = getParamSymbol(DAG, idx, PtrVT); + SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); + if (p.getNode()) + p.getNode()->setIROrder(idx + 1); + InVals.push_back(p); + } + + // Clang will check explicit VarArg and issue error if any. However, Clang + // will let code with + // implicit var arg like f() pass. See bug 617733. + // We treat this case as if the arg list is empty. 
+ // if (F.isVarArg()) { + // assert(0 && "VarArg not supported yet!"); + //} + + if (!OutChains.empty()) + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); + + return Chain; +} + +SDValue +NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SDLoc &dl, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + Type *RetTy = MF.getFunction()->getReturnType(); + + bool isABI = (STI.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + + const DataLayout DL = DAG.getDataLayout(); + SmallVector VTs; + SmallVector Offsets; + ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); + assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); + + auto VectorInfo = VectorizePTXValueVTs( + VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); + + // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than + // 32-bits are sign extended or zero extended, depending on whether + // they are signed or unsigned types. + bool ExtendIntegerRetVal = + RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; + + SmallVector StoreOperands; + for (unsigned i = 0, e = VTs.size(); i != e; ++i) { + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + + SDValue RetVal = OutVals[i]; + if (ExtendIntegerRetVal) { + RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + dl, MVT::i32, RetVal); + } else if (RetVal.getValueSizeInBits() < 16) { + // Use 16-bit registers for small load-stores as it's the + // smallest general purpose register size supported by NVPTX. + RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); + } + + // Record the value to return. + StoreOperands.push_back(RetVal); + + // That's the last element of this store op. + if (VectorInfo[i] & PVF_LAST) { + NVPTXISD::NodeType Op; + unsigned NumElts = StoreOperands.size() - 2; + switch (NumElts) { + case 1: + Op = NVPTXISD::StoreRetval; + break; + case 2: + Op = NVPTXISD::StoreRetvalV2; + break; + case 4: + Op = NVPTXISD::StoreRetvalV4; + break; + default: + llvm_unreachable("Invalid vector info."); + } + + // Adjust type of load/store op if we've extended the scalar + // return value. + EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; + Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), + StoreOperands, TheStoreType, + MachinePointerInfo(), 1); + // Cleanup vector state. 
+ StoreOperands.clear(); + } + } + + return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); +} + +void NVPTXTargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector &Ops, + SelectionDAG &DAG) const { + if (Constraint.length() > 1) + return; + else + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +static unsigned getOpcForTextureInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + + case Intrinsic::nvvm_tex_1d_v4f32_s32: + return NVPTXISD::Tex1DFloatS32; + case Intrinsic::nvvm_tex_1d_v4f32_f32: + return NVPTXISD::Tex1DFloatFloat; + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_v4s32_s32: + return NVPTXISD::Tex1DS32S32; + case Intrinsic::nvvm_tex_1d_v4s32_f32: + return NVPTXISD::Tex1DS32Float; + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + return NVPTXISD::Tex1DS32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + return NVPTXISD::Tex1DS32FloatGrad; + case Intrinsic::nvvm_tex_1d_v4u32_s32: + return NVPTXISD::Tex1DU32S32; + case Intrinsic::nvvm_tex_1d_v4u32_f32: + return NVPTXISD::Tex1DU32Float; + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + return NVPTXISD::Tex1DU32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + return NVPTXISD::Tex1DU32FloatGrad; + + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + return NVPTXISD::Tex1DArrayFloatS32; + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloat; + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + return NVPTXISD::Tex1DArrayS32S32; + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + return NVPTXISD::Tex1DArrayS32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + return NVPTXISD::Tex1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + return NVPTXISD::Tex1DArrayU32S32; + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + return NVPTXISD::Tex1DArrayU32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + return NVPTXISD::Tex1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_v4f32_s32: + return NVPTXISD::Tex2DFloatS32; + case Intrinsic::nvvm_tex_2d_v4f32_f32: + return NVPTXISD::Tex2DFloatFloat; + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_v4s32_s32: + return NVPTXISD::Tex2DS32S32; + case Intrinsic::nvvm_tex_2d_v4s32_f32: + return NVPTXISD::Tex2DS32Float; + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + return NVPTXISD::Tex2DS32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + return NVPTXISD::Tex2DS32FloatGrad; + case Intrinsic::nvvm_tex_2d_v4u32_s32: + return NVPTXISD::Tex2DU32S32; + case Intrinsic::nvvm_tex_2d_v4u32_f32: + return NVPTXISD::Tex2DU32Float; + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + return NVPTXISD::Tex2DU32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + return NVPTXISD::Tex2DU32FloatGrad; + + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 
+ return NVPTXISD::Tex2DArrayFloatS32; + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloat; + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + return NVPTXISD::Tex2DArrayS32S32; + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + return NVPTXISD::Tex2DArrayS32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + return NVPTXISD::Tex2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + return NVPTXISD::Tex2DArrayU32S32; + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + return NVPTXISD::Tex2DArrayU32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + return NVPTXISD::Tex2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_3d_v4f32_s32: + return NVPTXISD::Tex3DFloatS32; + case Intrinsic::nvvm_tex_3d_v4f32_f32: + return NVPTXISD::Tex3DFloatFloat; + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatGrad; + case Intrinsic::nvvm_tex_3d_v4s32_s32: + return NVPTXISD::Tex3DS32S32; + case Intrinsic::nvvm_tex_3d_v4s32_f32: + return NVPTXISD::Tex3DS32Float; + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + return NVPTXISD::Tex3DS32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + return NVPTXISD::Tex3DS32FloatGrad; + case Intrinsic::nvvm_tex_3d_v4u32_s32: + return NVPTXISD::Tex3DU32S32; + case Intrinsic::nvvm_tex_3d_v4u32_f32: + return NVPTXISD::Tex3DU32Float; + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + return NVPTXISD::Tex3DU32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + return NVPTXISD::Tex3DU32FloatGrad; + + case Intrinsic::nvvm_tex_cube_v4f32_f32: + return NVPTXISD::TexCubeFloatFloat; + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + return NVPTXISD::TexCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_v4s32_f32: + return NVPTXISD::TexCubeS32Float; + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + return NVPTXISD::TexCubeS32FloatLevel; + case Intrinsic::nvvm_tex_cube_v4u32_f32: + return NVPTXISD::TexCubeU32Float; + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + return NVPTXISD::TexCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + return NVPTXISD::TexCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + return NVPTXISD::TexCubeArrayS32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + return NVPTXISD::TexCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + return NVPTXISD::TexCubeArrayU32Float; + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + return NVPTXISD::TexCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + return NVPTXISD::Tld4R2DFloatFloat; + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + return NVPTXISD::Tld4G2DFloatFloat; + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + return NVPTXISD::Tld4B2DFloatFloat; + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + return NVPTXISD::Tld4A2DFloatFloat; + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + return NVPTXISD::Tld4R2DS64Float; + case 
Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + return NVPTXISD::Tld4G2DS64Float; + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + return NVPTXISD::Tld4B2DS64Float; + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + return NVPTXISD::Tld4A2DS64Float; + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + return NVPTXISD::Tld4R2DU64Float; + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + return NVPTXISD::Tld4G2DU64Float; + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + return NVPTXISD::Tld4B2DU64Float; + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + return NVPTXISD::Tld4A2DU64Float; + + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + return NVPTXISD::TexUnified1DFloatS32; + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + return NVPTXISD::TexUnified1DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + return NVPTXISD::TexUnified1DS32S32; + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + return NVPTXISD::TexUnified1DS32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + return NVPTXISD::TexUnified1DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + return NVPTXISD::TexUnified1DU32S32; + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + return NVPTXISD::TexUnified1DU32Float; + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + return NVPTXISD::TexUnified1DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + return NVPTXISD::TexUnified1DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + return NVPTXISD::TexUnified1DArrayS32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified1DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + return NVPTXISD::TexUnified1DArrayU32S32; + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32Float; + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified1DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + return NVPTXISD::TexUnified2DFloatS32; + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + return NVPTXISD::TexUnified2DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + return NVPTXISD::TexUnified2DS32S32; + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + return NVPTXISD::TexUnified2DS32Float; + case 
Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + return NVPTXISD::TexUnified2DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + return NVPTXISD::TexUnified2DU32S32; + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + return NVPTXISD::TexUnified2DU32Float; + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + return NVPTXISD::TexUnified2DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + return NVPTXISD::TexUnified2DArrayFloatS32; + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + return NVPTXISD::TexUnified2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + return NVPTXISD::TexUnified2DArrayS32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + return NVPTXISD::TexUnified2DArrayS32FloatGrad; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + return NVPTXISD::TexUnified2DArrayU32S32; + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32Float; + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatLevel; + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + return NVPTXISD::TexUnified2DArrayU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + return NVPTXISD::TexUnified3DFloatS32; + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloat; + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + return NVPTXISD::TexUnified3DFloatFloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + return NVPTXISD::TexUnified3DS32S32; + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + return NVPTXISD::TexUnified3DS32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + return NVPTXISD::TexUnified3DS32FloatGrad; + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + return NVPTXISD::TexUnified3DU32S32; + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + return NVPTXISD::TexUnified3DU32Float; + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatLevel; + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + return NVPTXISD::TexUnified3DU32FloatGrad; + + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + return NVPTXISD::TexUnifiedCubeU32Float; + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + return 
NVPTXISD::TexUnifiedCubeU32FloatLevel; + + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloat; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32Float; + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; + + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedR2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedG2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedB2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + return NVPTXISD::Tld4UnifiedA2DFloatFloat; + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedR2DS64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedG2DS64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedB2DS64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + return NVPTXISD::Tld4UnifiedA2DS64Float; + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedR2DU64Float; + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedG2DU64Float; + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedB2DU64Float; + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + return NVPTXISD::Tld4UnifiedA2DU64Float; + } +} + +static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + return 0; + case Intrinsic::nvvm_suld_1d_i8_clamp: + return NVPTXISD::Suld1DI8Clamp; + case Intrinsic::nvvm_suld_1d_i16_clamp: + return NVPTXISD::Suld1DI16Clamp; + case Intrinsic::nvvm_suld_1d_i32_clamp: + return NVPTXISD::Suld1DI32Clamp; + case Intrinsic::nvvm_suld_1d_i64_clamp: + return NVPTXISD::Suld1DI64Clamp; + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + return NVPTXISD::Suld1DV2I8Clamp; + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + return NVPTXISD::Suld1DV2I16Clamp; + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + return NVPTXISD::Suld1DV2I32Clamp; + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + return NVPTXISD::Suld1DV2I64Clamp; + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + return NVPTXISD::Suld1DV4I8Clamp; + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + return NVPTXISD::Suld1DV4I16Clamp; + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + return NVPTXISD::Suld1DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + return NVPTXISD::Suld1DArrayI8Clamp; + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + return NVPTXISD::Suld1DArrayI16Clamp; + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + return NVPTXISD::Suld1DArrayI32Clamp; + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + return NVPTXISD::Suld1DArrayI64Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + return NVPTXISD::Suld1DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + return NVPTXISD::Suld1DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + return NVPTXISD::Suld1DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + return 
NVPTXISD::Suld1DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + return NVPTXISD::Suld1DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + return NVPTXISD::Suld1DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + return NVPTXISD::Suld1DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_2d_i8_clamp: + return NVPTXISD::Suld2DI8Clamp; + case Intrinsic::nvvm_suld_2d_i16_clamp: + return NVPTXISD::Suld2DI16Clamp; + case Intrinsic::nvvm_suld_2d_i32_clamp: + return NVPTXISD::Suld2DI32Clamp; + case Intrinsic::nvvm_suld_2d_i64_clamp: + return NVPTXISD::Suld2DI64Clamp; + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + return NVPTXISD::Suld2DV2I8Clamp; + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + return NVPTXISD::Suld2DV2I16Clamp; + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + return NVPTXISD::Suld2DV2I32Clamp; + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + return NVPTXISD::Suld2DV2I64Clamp; + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + return NVPTXISD::Suld2DV4I8Clamp; + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + return NVPTXISD::Suld2DV4I16Clamp; + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + return NVPTXISD::Suld2DV4I32Clamp; + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + return NVPTXISD::Suld2DArrayI8Clamp; + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + return NVPTXISD::Suld2DArrayI16Clamp; + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + return NVPTXISD::Suld2DArrayI32Clamp; + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + return NVPTXISD::Suld2DArrayI64Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + return NVPTXISD::Suld2DArrayV2I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + return NVPTXISD::Suld2DArrayV2I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + return NVPTXISD::Suld2DArrayV2I32Clamp; + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + return NVPTXISD::Suld2DArrayV2I64Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + return NVPTXISD::Suld2DArrayV4I8Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + return NVPTXISD::Suld2DArrayV4I16Clamp; + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + return NVPTXISD::Suld2DArrayV4I32Clamp; + case Intrinsic::nvvm_suld_3d_i8_clamp: + return NVPTXISD::Suld3DI8Clamp; + case Intrinsic::nvvm_suld_3d_i16_clamp: + return NVPTXISD::Suld3DI16Clamp; + case Intrinsic::nvvm_suld_3d_i32_clamp: + return NVPTXISD::Suld3DI32Clamp; + case Intrinsic::nvvm_suld_3d_i64_clamp: + return NVPTXISD::Suld3DI64Clamp; + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + return NVPTXISD::Suld3DV2I8Clamp; + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + return NVPTXISD::Suld3DV2I16Clamp; + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + return NVPTXISD::Suld3DV2I32Clamp; + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + return NVPTXISD::Suld3DV2I64Clamp; + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + return NVPTXISD::Suld3DV4I8Clamp; + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + return NVPTXISD::Suld3DV4I16Clamp; + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + return NVPTXISD::Suld3DV4I32Clamp; + case Intrinsic::nvvm_suld_1d_i8_trap: + return NVPTXISD::Suld1DI8Trap; + case Intrinsic::nvvm_suld_1d_i16_trap: + return NVPTXISD::Suld1DI16Trap; + case Intrinsic::nvvm_suld_1d_i32_trap: + return NVPTXISD::Suld1DI32Trap; + case Intrinsic::nvvm_suld_1d_i64_trap: + return NVPTXISD::Suld1DI64Trap; + case Intrinsic::nvvm_suld_1d_v2i8_trap: + return NVPTXISD::Suld1DV2I8Trap; + case Intrinsic::nvvm_suld_1d_v2i16_trap: + return NVPTXISD::Suld1DV2I16Trap; + case Intrinsic::nvvm_suld_1d_v2i32_trap: + return 
NVPTXISD::Suld1DV2I32Trap; + case Intrinsic::nvvm_suld_1d_v2i64_trap: + return NVPTXISD::Suld1DV2I64Trap; + case Intrinsic::nvvm_suld_1d_v4i8_trap: + return NVPTXISD::Suld1DV4I8Trap; + case Intrinsic::nvvm_suld_1d_v4i16_trap: + return NVPTXISD::Suld1DV4I16Trap; + case Intrinsic::nvvm_suld_1d_v4i32_trap: + return NVPTXISD::Suld1DV4I32Trap; + case Intrinsic::nvvm_suld_1d_array_i8_trap: + return NVPTXISD::Suld1DArrayI8Trap; + case Intrinsic::nvvm_suld_1d_array_i16_trap: + return NVPTXISD::Suld1DArrayI16Trap; + case Intrinsic::nvvm_suld_1d_array_i32_trap: + return NVPTXISD::Suld1DArrayI32Trap; + case Intrinsic::nvvm_suld_1d_array_i64_trap: + return NVPTXISD::Suld1DArrayI64Trap; + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + return NVPTXISD::Suld1DArrayV2I8Trap; + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + return NVPTXISD::Suld1DArrayV2I16Trap; + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + return NVPTXISD::Suld1DArrayV2I32Trap; + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + return NVPTXISD::Suld1DArrayV2I64Trap; + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + return NVPTXISD::Suld1DArrayV4I8Trap; + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + return NVPTXISD::Suld1DArrayV4I16Trap; + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + return NVPTXISD::Suld1DArrayV4I32Trap; + case Intrinsic::nvvm_suld_2d_i8_trap: + return NVPTXISD::Suld2DI8Trap; + case Intrinsic::nvvm_suld_2d_i16_trap: + return NVPTXISD::Suld2DI16Trap; + case Intrinsic::nvvm_suld_2d_i32_trap: + return NVPTXISD::Suld2DI32Trap; + case Intrinsic::nvvm_suld_2d_i64_trap: + return NVPTXISD::Suld2DI64Trap; + case Intrinsic::nvvm_suld_2d_v2i8_trap: + return NVPTXISD::Suld2DV2I8Trap; + case Intrinsic::nvvm_suld_2d_v2i16_trap: + return NVPTXISD::Suld2DV2I16Trap; + case Intrinsic::nvvm_suld_2d_v2i32_trap: + return NVPTXISD::Suld2DV2I32Trap; + case Intrinsic::nvvm_suld_2d_v2i64_trap: + return NVPTXISD::Suld2DV2I64Trap; + case Intrinsic::nvvm_suld_2d_v4i8_trap: + return NVPTXISD::Suld2DV4I8Trap; + case Intrinsic::nvvm_suld_2d_v4i16_trap: + return NVPTXISD::Suld2DV4I16Trap; + case Intrinsic::nvvm_suld_2d_v4i32_trap: + return NVPTXISD::Suld2DV4I32Trap; + case Intrinsic::nvvm_suld_2d_array_i8_trap: + return NVPTXISD::Suld2DArrayI8Trap; + case Intrinsic::nvvm_suld_2d_array_i16_trap: + return NVPTXISD::Suld2DArrayI16Trap; + case Intrinsic::nvvm_suld_2d_array_i32_trap: + return NVPTXISD::Suld2DArrayI32Trap; + case Intrinsic::nvvm_suld_2d_array_i64_trap: + return NVPTXISD::Suld2DArrayI64Trap; + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + return NVPTXISD::Suld2DArrayV2I8Trap; + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + return NVPTXISD::Suld2DArrayV2I16Trap; + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + return NVPTXISD::Suld2DArrayV2I32Trap; + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + return NVPTXISD::Suld2DArrayV2I64Trap; + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + return NVPTXISD::Suld2DArrayV4I8Trap; + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + return NVPTXISD::Suld2DArrayV4I16Trap; + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + return NVPTXISD::Suld2DArrayV4I32Trap; + case Intrinsic::nvvm_suld_3d_i8_trap: + return NVPTXISD::Suld3DI8Trap; + case Intrinsic::nvvm_suld_3d_i16_trap: + return NVPTXISD::Suld3DI16Trap; + case Intrinsic::nvvm_suld_3d_i32_trap: + return NVPTXISD::Suld3DI32Trap; + case Intrinsic::nvvm_suld_3d_i64_trap: + return NVPTXISD::Suld3DI64Trap; + case Intrinsic::nvvm_suld_3d_v2i8_trap: + return NVPTXISD::Suld3DV2I8Trap; + case Intrinsic::nvvm_suld_3d_v2i16_trap: + return 
NVPTXISD::Suld3DV2I16Trap; + case Intrinsic::nvvm_suld_3d_v2i32_trap: + return NVPTXISD::Suld3DV2I32Trap; + case Intrinsic::nvvm_suld_3d_v2i64_trap: + return NVPTXISD::Suld3DV2I64Trap; + case Intrinsic::nvvm_suld_3d_v4i8_trap: + return NVPTXISD::Suld3DV4I8Trap; + case Intrinsic::nvvm_suld_3d_v4i16_trap: + return NVPTXISD::Suld3DV4I16Trap; + case Intrinsic::nvvm_suld_3d_v4i32_trap: + return NVPTXISD::Suld3DV4I32Trap; + case Intrinsic::nvvm_suld_1d_i8_zero: + return NVPTXISD::Suld1DI8Zero; + case Intrinsic::nvvm_suld_1d_i16_zero: + return NVPTXISD::Suld1DI16Zero; + case Intrinsic::nvvm_suld_1d_i32_zero: + return NVPTXISD::Suld1DI32Zero; + case Intrinsic::nvvm_suld_1d_i64_zero: + return NVPTXISD::Suld1DI64Zero; + case Intrinsic::nvvm_suld_1d_v2i8_zero: + return NVPTXISD::Suld1DV2I8Zero; + case Intrinsic::nvvm_suld_1d_v2i16_zero: + return NVPTXISD::Suld1DV2I16Zero; + case Intrinsic::nvvm_suld_1d_v2i32_zero: + return NVPTXISD::Suld1DV2I32Zero; + case Intrinsic::nvvm_suld_1d_v2i64_zero: + return NVPTXISD::Suld1DV2I64Zero; + case Intrinsic::nvvm_suld_1d_v4i8_zero: + return NVPTXISD::Suld1DV4I8Zero; + case Intrinsic::nvvm_suld_1d_v4i16_zero: + return NVPTXISD::Suld1DV4I16Zero; + case Intrinsic::nvvm_suld_1d_v4i32_zero: + return NVPTXISD::Suld1DV4I32Zero; + case Intrinsic::nvvm_suld_1d_array_i8_zero: + return NVPTXISD::Suld1DArrayI8Zero; + case Intrinsic::nvvm_suld_1d_array_i16_zero: + return NVPTXISD::Suld1DArrayI16Zero; + case Intrinsic::nvvm_suld_1d_array_i32_zero: + return NVPTXISD::Suld1DArrayI32Zero; + case Intrinsic::nvvm_suld_1d_array_i64_zero: + return NVPTXISD::Suld1DArrayI64Zero; + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + return NVPTXISD::Suld1DArrayV2I8Zero; + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + return NVPTXISD::Suld1DArrayV2I16Zero; + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + return NVPTXISD::Suld1DArrayV2I32Zero; + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + return NVPTXISD::Suld1DArrayV2I64Zero; + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + return NVPTXISD::Suld1DArrayV4I8Zero; + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + return NVPTXISD::Suld1DArrayV4I16Zero; + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + return NVPTXISD::Suld1DArrayV4I32Zero; + case Intrinsic::nvvm_suld_2d_i8_zero: + return NVPTXISD::Suld2DI8Zero; + case Intrinsic::nvvm_suld_2d_i16_zero: + return NVPTXISD::Suld2DI16Zero; + case Intrinsic::nvvm_suld_2d_i32_zero: + return NVPTXISD::Suld2DI32Zero; + case Intrinsic::nvvm_suld_2d_i64_zero: + return NVPTXISD::Suld2DI64Zero; + case Intrinsic::nvvm_suld_2d_v2i8_zero: + return NVPTXISD::Suld2DV2I8Zero; + case Intrinsic::nvvm_suld_2d_v2i16_zero: + return NVPTXISD::Suld2DV2I16Zero; + case Intrinsic::nvvm_suld_2d_v2i32_zero: + return NVPTXISD::Suld2DV2I32Zero; + case Intrinsic::nvvm_suld_2d_v2i64_zero: + return NVPTXISD::Suld2DV2I64Zero; + case Intrinsic::nvvm_suld_2d_v4i8_zero: + return NVPTXISD::Suld2DV4I8Zero; + case Intrinsic::nvvm_suld_2d_v4i16_zero: + return NVPTXISD::Suld2DV4I16Zero; + case Intrinsic::nvvm_suld_2d_v4i32_zero: + return NVPTXISD::Suld2DV4I32Zero; + case Intrinsic::nvvm_suld_2d_array_i8_zero: + return NVPTXISD::Suld2DArrayI8Zero; + case Intrinsic::nvvm_suld_2d_array_i16_zero: + return NVPTXISD::Suld2DArrayI16Zero; + case Intrinsic::nvvm_suld_2d_array_i32_zero: + return NVPTXISD::Suld2DArrayI32Zero; + case Intrinsic::nvvm_suld_2d_array_i64_zero: + return NVPTXISD::Suld2DArrayI64Zero; + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + return NVPTXISD::Suld2DArrayV2I8Zero; + case 
Intrinsic::nvvm_suld_2d_array_v2i16_zero: + return NVPTXISD::Suld2DArrayV2I16Zero; + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + return NVPTXISD::Suld2DArrayV2I32Zero; + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + return NVPTXISD::Suld2DArrayV2I64Zero; + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + return NVPTXISD::Suld2DArrayV4I8Zero; + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + return NVPTXISD::Suld2DArrayV4I16Zero; + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + return NVPTXISD::Suld2DArrayV4I32Zero; + case Intrinsic::nvvm_suld_3d_i8_zero: + return NVPTXISD::Suld3DI8Zero; + case Intrinsic::nvvm_suld_3d_i16_zero: + return NVPTXISD::Suld3DI16Zero; + case Intrinsic::nvvm_suld_3d_i32_zero: + return NVPTXISD::Suld3DI32Zero; + case Intrinsic::nvvm_suld_3d_i64_zero: + return NVPTXISD::Suld3DI64Zero; + case Intrinsic::nvvm_suld_3d_v2i8_zero: + return NVPTXISD::Suld3DV2I8Zero; + case Intrinsic::nvvm_suld_3d_v2i16_zero: + return NVPTXISD::Suld3DV2I16Zero; + case Intrinsic::nvvm_suld_3d_v2i32_zero: + return NVPTXISD::Suld3DV2I32Zero; + case Intrinsic::nvvm_suld_3d_v2i64_zero: + return NVPTXISD::Suld3DV2I64Zero; + case Intrinsic::nvvm_suld_3d_v4i8_zero: + return NVPTXISD::Suld3DV4I8Zero; + case Intrinsic::nvvm_suld_3d_v4i16_zero: + return NVPTXISD::Suld3DV4I16Zero; + case Intrinsic::nvvm_suld_3d_v4i32_zero: + return NVPTXISD::Suld3DV4I32Zero; + } +} + +// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as +// TgtMemIntrinsic +// because we need the information that is only available in the "Value" type +// of destination +// pointer. In particular, the address space information. +bool NVPTXTargetLowering::getTgtMemIntrinsic( + IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { + switch (Intrinsic) { + default: + return false; + + case Intrinsic::nvvm_atomic_load_add_f32: + case Intrinsic::nvvm_atomic_load_inc_32: + case Intrinsic::nvvm_atomic_load_dec_32: + + case Intrinsic::nvvm_atomic_add_gen_f_cta: + case Intrinsic::nvvm_atomic_add_gen_f_sys: + case Intrinsic::nvvm_atomic_add_gen_i_cta: + case Intrinsic::nvvm_atomic_add_gen_i_sys: + case Intrinsic::nvvm_atomic_and_gen_i_cta: + case Intrinsic::nvvm_atomic_and_gen_i_sys: + case Intrinsic::nvvm_atomic_cas_gen_i_cta: + case Intrinsic::nvvm_atomic_cas_gen_i_sys: + case Intrinsic::nvvm_atomic_dec_gen_i_cta: + case Intrinsic::nvvm_atomic_dec_gen_i_sys: + case Intrinsic::nvvm_atomic_inc_gen_i_cta: + case Intrinsic::nvvm_atomic_inc_gen_i_sys: + case Intrinsic::nvvm_atomic_max_gen_i_cta: + case Intrinsic::nvvm_atomic_max_gen_i_sys: + case Intrinsic::nvvm_atomic_min_gen_i_cta: + case Intrinsic::nvvm_atomic_min_gen_i_sys: + case Intrinsic::nvvm_atomic_or_gen_i_cta: + case Intrinsic::nvvm_atomic_or_gen_i_sys: + case Intrinsic::nvvm_atomic_exch_gen_i_cta: + case Intrinsic::nvvm_atomic_exch_gen_i_sys: + case Intrinsic::nvvm_atomic_xor_gen_i_cta: + case Intrinsic::nvvm_atomic_xor_gen_i_sys: { + auto &DL = I.getModule()->getDataLayout(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = true; + Info.align = 0; + return true; + } + + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: { + auto &DL = I.getModule()->getDataLayout(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldu_global_i) + Info.memVT = getValueType(DL, I.getType()); + else if(Intrinsic == 
Intrinsic::nvvm_ldu_global_p) + Info.memVT = getPointerTy(DL); + else + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = cast(I.getArgOperand(1))->getZExtValue(); + + return true; + } + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: { + auto &DL = I.getModule()->getDataLayout(); + + Info.opc = ISD::INTRINSIC_W_CHAIN; + if (Intrinsic == Intrinsic::nvvm_ldg_global_i) + Info.memVT = getValueType(DL, I.getType()); + else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) + Info.memVT = getPointerTy(DL); + else + Info.memVT = getValueType(DL, I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = cast(I.getArgOperand(1))->getZExtValue(); + + return true; + } + + case Intrinsic::nvvm_tex_1d_v4f32_s32: + case Intrinsic::nvvm_tex_1d_v4f32_f32: + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_v4f32_s32: + case Intrinsic::nvvm_tex_2d_v4f32_f32: + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_3d_v4f32_s32: + case Intrinsic::nvvm_tex_3d_v4f32_f32: + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_cube_v4f32_f32: + case Intrinsic::nvvm_tex_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_v4f32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: + case 
Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4f32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + + case Intrinsic::nvvm_tex_1d_v4s32_s32: + case Intrinsic::nvvm_tex_1d_v4s32_f32: + case Intrinsic::nvvm_tex_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_v4s32_s32: + case Intrinsic::nvvm_tex_2d_v4s32_f32: + case Intrinsic::nvvm_tex_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_3d_v4s32_s32: + case Intrinsic::nvvm_tex_3d_v4s32_f32: + case Intrinsic::nvvm_tex_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4s32_f32: + case Intrinsic::nvvm_tex_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_cube_v4u32_f32: + case Intrinsic::nvvm_tex_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_v4u32_s32: + case Intrinsic::nvvm_tex_1d_v4u32_f32: + case Intrinsic::nvvm_tex_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_v4u32_s32: + case Intrinsic::nvvm_tex_2d_v4u32_f32: + case Intrinsic::nvvm_tex_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_3d_v4u32_s32: + case Intrinsic::nvvm_tex_3d_v4u32_f32: + case Intrinsic::nvvm_tex_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: + case 
Intrinsic::nvvm_tex_unified_2d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: + case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: + case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: + case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: + case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: + case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::v4i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i8_clamp: + case Intrinsic::nvvm_suld_1d_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_array_i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_i8_clamp: + case Intrinsic::nvvm_suld_2d_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_v4i8_clamp: + case Intrinsic::nvvm_suld_2d_array_i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: + case Intrinsic::nvvm_suld_3d_i8_clamp: + case Intrinsic::nvvm_suld_3d_v2i8_clamp: + case Intrinsic::nvvm_suld_3d_v4i8_clamp: + case Intrinsic::nvvm_suld_1d_i8_trap: + case 
Intrinsic::nvvm_suld_1d_v2i8_trap: + case Intrinsic::nvvm_suld_1d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_array_i8_trap: + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + case Intrinsic::nvvm_suld_2d_i8_trap: + case Intrinsic::nvvm_suld_2d_v2i8_trap: + case Intrinsic::nvvm_suld_2d_v4i8_trap: + case Intrinsic::nvvm_suld_2d_array_i8_trap: + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + case Intrinsic::nvvm_suld_3d_i8_trap: + case Intrinsic::nvvm_suld_3d_v2i8_trap: + case Intrinsic::nvvm_suld_3d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_i8_zero: + case Intrinsic::nvvm_suld_1d_v2i8_zero: + case Intrinsic::nvvm_suld_1d_v4i8_zero: + case Intrinsic::nvvm_suld_1d_array_i8_zero: + case Intrinsic::nvvm_suld_1d_array_v2i8_zero: + case Intrinsic::nvvm_suld_1d_array_v4i8_zero: + case Intrinsic::nvvm_suld_2d_i8_zero: + case Intrinsic::nvvm_suld_2d_v2i8_zero: + case Intrinsic::nvvm_suld_2d_v4i8_zero: + case Intrinsic::nvvm_suld_2d_array_i8_zero: + case Intrinsic::nvvm_suld_2d_array_v2i8_zero: + case Intrinsic::nvvm_suld_2d_array_v4i8_zero: + case Intrinsic::nvvm_suld_3d_i8_zero: + case Intrinsic::nvvm_suld_3d_v2i8_zero: + case Intrinsic::nvvm_suld_3d_v4i8_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i8; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i16_clamp: + case Intrinsic::nvvm_suld_1d_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_array_i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_i16_clamp: + case Intrinsic::nvvm_suld_2d_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_v4i16_clamp: + case Intrinsic::nvvm_suld_2d_array_i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: + case Intrinsic::nvvm_suld_3d_i16_clamp: + case Intrinsic::nvvm_suld_3d_v2i16_clamp: + case Intrinsic::nvvm_suld_3d_v4i16_clamp: + case Intrinsic::nvvm_suld_1d_i16_trap: + case Intrinsic::nvvm_suld_1d_v2i16_trap: + case Intrinsic::nvvm_suld_1d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_array_i16_trap: + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + case Intrinsic::nvvm_suld_2d_i16_trap: + case Intrinsic::nvvm_suld_2d_v2i16_trap: + case Intrinsic::nvvm_suld_2d_v4i16_trap: + case Intrinsic::nvvm_suld_2d_array_i16_trap: + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + case Intrinsic::nvvm_suld_3d_i16_trap: + case Intrinsic::nvvm_suld_3d_v2i16_trap: + case Intrinsic::nvvm_suld_3d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_i16_zero: + case Intrinsic::nvvm_suld_1d_v2i16_zero: + case Intrinsic::nvvm_suld_1d_v4i16_zero: + case Intrinsic::nvvm_suld_1d_array_i16_zero: + case Intrinsic::nvvm_suld_1d_array_v2i16_zero: + case Intrinsic::nvvm_suld_1d_array_v4i16_zero: + case Intrinsic::nvvm_suld_2d_i16_zero: + case Intrinsic::nvvm_suld_2d_v2i16_zero: + case Intrinsic::nvvm_suld_2d_v4i16_zero: + case Intrinsic::nvvm_suld_2d_array_i16_zero: + case Intrinsic::nvvm_suld_2d_array_v2i16_zero: + case Intrinsic::nvvm_suld_2d_array_v4i16_zero: + case Intrinsic::nvvm_suld_3d_i16_zero: + case Intrinsic::nvvm_suld_3d_v2i16_zero: + case Intrinsic::nvvm_suld_3d_v4i16_zero: + Info.opc = 
getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i16; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i32_clamp: + case Intrinsic::nvvm_suld_1d_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_array_i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_i32_clamp: + case Intrinsic::nvvm_suld_2d_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_v4i32_clamp: + case Intrinsic::nvvm_suld_2d_array_i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: + case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: + case Intrinsic::nvvm_suld_3d_i32_clamp: + case Intrinsic::nvvm_suld_3d_v2i32_clamp: + case Intrinsic::nvvm_suld_3d_v4i32_clamp: + case Intrinsic::nvvm_suld_1d_i32_trap: + case Intrinsic::nvvm_suld_1d_v2i32_trap: + case Intrinsic::nvvm_suld_1d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_array_i32_trap: + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + case Intrinsic::nvvm_suld_2d_i32_trap: + case Intrinsic::nvvm_suld_2d_v2i32_trap: + case Intrinsic::nvvm_suld_2d_v4i32_trap: + case Intrinsic::nvvm_suld_2d_array_i32_trap: + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + case Intrinsic::nvvm_suld_3d_i32_trap: + case Intrinsic::nvvm_suld_3d_v2i32_trap: + case Intrinsic::nvvm_suld_3d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_i32_zero: + case Intrinsic::nvvm_suld_1d_v2i32_zero: + case Intrinsic::nvvm_suld_1d_v4i32_zero: + case Intrinsic::nvvm_suld_1d_array_i32_zero: + case Intrinsic::nvvm_suld_1d_array_v2i32_zero: + case Intrinsic::nvvm_suld_1d_array_v4i32_zero: + case Intrinsic::nvvm_suld_2d_i32_zero: + case Intrinsic::nvvm_suld_2d_v2i32_zero: + case Intrinsic::nvvm_suld_2d_v4i32_zero: + case Intrinsic::nvvm_suld_2d_array_i32_zero: + case Intrinsic::nvvm_suld_2d_array_v2i32_zero: + case Intrinsic::nvvm_suld_2d_array_v4i32_zero: + case Intrinsic::nvvm_suld_3d_i32_zero: + case Intrinsic::nvvm_suld_3d_v2i32_zero: + case Intrinsic::nvvm_suld_3d_v4i32_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + + case Intrinsic::nvvm_suld_1d_i64_clamp: + case Intrinsic::nvvm_suld_1d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_array_i64_clamp: + case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_i64_clamp: + case Intrinsic::nvvm_suld_2d_v2i64_clamp: + case Intrinsic::nvvm_suld_2d_array_i64_clamp: + case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: + case Intrinsic::nvvm_suld_3d_i64_clamp: + case Intrinsic::nvvm_suld_3d_v2i64_clamp: + case Intrinsic::nvvm_suld_1d_i64_trap: + case Intrinsic::nvvm_suld_1d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_array_i64_trap: + case Intrinsic::nvvm_suld_1d_array_v2i64_trap: + case Intrinsic::nvvm_suld_2d_i64_trap: + case Intrinsic::nvvm_suld_2d_v2i64_trap: + case Intrinsic::nvvm_suld_2d_array_i64_trap: + case Intrinsic::nvvm_suld_2d_array_v2i64_trap: + case Intrinsic::nvvm_suld_3d_i64_trap: + case Intrinsic::nvvm_suld_3d_v2i64_trap: + case Intrinsic::nvvm_suld_1d_i64_zero: + case Intrinsic::nvvm_suld_1d_v2i64_zero: + case Intrinsic::nvvm_suld_1d_array_i64_zero: + case Intrinsic::nvvm_suld_1d_array_v2i64_zero: + case 
Intrinsic::nvvm_suld_2d_i64_zero: + case Intrinsic::nvvm_suld_2d_v2i64_zero: + case Intrinsic::nvvm_suld_2d_array_i64_zero: + case Intrinsic::nvvm_suld_2d_array_v2i64_zero: + case Intrinsic::nvvm_suld_3d_i64_zero: + case Intrinsic::nvvm_suld_3d_v2i64_zero: + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i64; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + return false; +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +/// Used to guide target specific optimizations, like loop strength reduction +/// (LoopStrengthReduce.cpp) and memory optimization for address mode +/// (CodeGenPrepare.cpp) +bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // AddrMode - This represents an addressing mode of: + // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + // + // The legal address modes are + // - [avar] + // - [areg] + // - [areg+immoff] + // - [immAddr] + + if (AM.BaseGV) { + return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; + } + + switch (AM.Scale) { + case 0: // "r", "r+i" or "i" is allowed + break; + case 1: + if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. + return false; + // Otherwise we have r+i. + break; + default: + // No scale > 1 is allowed + return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// NVPTX Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +NVPTXTargetLowering::ConstraintType +NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'b': + case 'r': + case 'h': + case 'c': + case 'l': + case 'f': + case 'd': + case '0': + case 'N': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair +NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'b': + return std::make_pair(0U, &NVPTX::Int1RegsRegClass); + case 'c': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'h': + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); + case 'r': + return std::make_pair(0U, &NVPTX::Int32RegsRegClass); + case 'l': + case 'N': + return std::make_pair(0U, &NVPTX::Int64RegsRegClass); + case 'f': + return std::make_pair(0U, &NVPTX::Float32RegsRegClass); + case 'd': + return std::make_pair(0U, &NVPTX::Float64RegsRegClass); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +//===----------------------------------------------------------------------===// +// NVPTX DAG Combining +//===----------------------------------------------------------------------===// + +bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, + CodeGenOpt::Level OptLevel) const { + // Always honor command-line argument + if (FMAContractLevelOpt.getNumOccurrences() > 0) + return FMAContractLevelOpt > 0; + + // Do not contract if we're not optimizing the code. 
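+  // Note that when nvptx-fma-level is given on the command line it has
+  // already been honored above, before this check or the TargetOptions and
+  // unsafe-fp-math checks below are consulted.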
+  if (OptLevel == 0)
+    return false;
+
+  // Honor TargetOptions flags that explicitly say fusion is okay.
+  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
+    return true;
+
+  return allowUnsafeFPMath(MF);
+}
+
+bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+  // Honor TargetOptions flags that explicitly say unsafe math is okay.
+  if (MF.getTarget().Options.UnsafeFPMath)
+    return true;
+
+  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
+  const Function *F = MF.getFunction();
+  if (F->hasFnAttribute("unsafe-fp-math")) {
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    StringRef Val = Attr.getValueAsString();
+    if (Val == "true")
+      return true;
+  }
+
+  return false;
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1. This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                             TargetLowering::DAGCombinerInfo &DCI,
+                                             const NVPTXSubtarget &Subtarget,
+                                             CodeGenOpt::Level OptLevel) {
+  SelectionDAG &DAG = DCI.DAG;
+  // Skip non-integer, non-scalar case
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  // fold (add (mul a, b), c) -> (mad a, b, c)
+  //
+  if (N0.getOpcode() == ISD::MUL) {
+    assert(VT.isInteger());
+    // For integer:
+    // Since integer multiply-add costs the same as integer multiply
+    // but is more costly than integer add, do the fusion only when
+    // the mul is only used in the add.
+    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
+        !N0.getNode()->hasOneUse())
+      return SDValue();
+
+    // Do the folding
+    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  } else if (N0.getOpcode() == ISD::FMUL) {
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+          &DAG.getTargetLoweringInfo());
+      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+        return SDValue();
+
+      // For floating point:
+      // Do the fusion only when the mul has fewer than 5 uses and all of
+      // them are adds.
+      // The heuristic is that if a use is not an add, then that use
+      // cannot be fused into an fma, therefore the mul is still needed anyway.
+      // If there are more than 4 uses, even if they are all adds, fusing
+      // them will increase register pressure.
+      //
+      int numUses = 0;
+      int nonAddCount = 0;
+      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+                                UE = N0.getNode()->use_end();
+           UI != UE; ++UI) {
+        numUses++;
+        SDNode *User = *UI;
+        if (User->getOpcode() != ISD::FADD)
+          ++nonAddCount;
+      }
+      if (numUses >= 5)
+        return SDValue();
+      if (nonAddCount) {
+        int orderNo = N->getIROrder();
+        int orderNo2 = N0.getNode()->getIROrder();
+        // Simple heuristic for estimating potential register pressure: the
+        // difference between the two IR orders measures the distance between
+        // def and use, and the longer that distance, the more likely it is to
+        // cause register pressure.
+        if (orderNo - orderNo2 < 500)
+          return SDValue();
+
+        // Now, check if at least one of the FMUL's operands is live beyond
+        // the node N, which guarantees that the FMA will not increase
+        // register pressure at node N.
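+        // For example, an operand that is a constant, or that has a user with
+        // a higher IR order than N, is considered live beyond N, so fusing the
+        // FMUL into an FMA at N is not expected to add register pressure there.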
+ bool opIsLive = false; + const SDNode *left = N0.getOperand(0).getNode(); + const SDNode *right = N0.getOperand(1).getNode(); + + if (isa(left) || isa(right)) + opIsLive = true; + + if (!opIsLive) + for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + int orderNo3 = User->getIROrder(); + if (orderNo3 > orderNo) { + opIsLive = true; + break; + } + } + + if (!opIsLive) + return SDValue(); + } + + return DAG.getNode(ISD::FMA, SDLoc(N), VT, + N0.getOperand(0), N0.getOperand(1), N1); + } + } + + return SDValue(); +} + +/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. +/// +static SDValue PerformADDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const NVPTXSubtarget &Subtarget, + CodeGenOpt::Level OptLevel) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // First try with the default operand order. + if (SDValue Result = + PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) + return Result; + + // If that didn't work, try again with the operands commuted. + return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); +} + +static SDValue PerformANDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // The type legalizer turns a vector load of i8 values into a zextload to i16 + // registers, optionally ANY_EXTENDs it (if target type is integer), + // and ANDs off the high 8 bits. Since we turn this load into a + // target-specific DAG node, the DAG combiner fails to eliminate these AND + // nodes. Do that here. + SDValue Val = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (isa(Val)) { + std::swap(Val, Mask); + } + + SDValue AExt; + // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and + if (Val.getOpcode() == ISD::ANY_EXTEND) { + AExt = Val; + Val = Val->getOperand(0); + } + + if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { + Val = Val->getOperand(0); + } + + if (Val->getOpcode() == NVPTXISD::LoadV2 || + Val->getOpcode() == NVPTXISD::LoadV4) { + ConstantSDNode *MaskCnst = dyn_cast(Mask); + if (!MaskCnst) { + // Not an AND with a constant + return SDValue(); + } + + uint64_t MaskVal = MaskCnst->getZExtValue(); + if (MaskVal != 0xff) { + // Not an AND that chops off top 8 bits + return SDValue(); + } + + MemSDNode *Mem = dyn_cast(Val); + if (!Mem) { + // Not a MemSDNode?!? + return SDValue(); + } + + EVT MemVT = Mem->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { + // We only handle the i8 case + return SDValue(); + } + + unsigned ExtType = + cast(Val->getOperand(Val->getNumOperands()-1))-> + getZExtValue(); + if (ExtType == ISD::SEXTLOAD) { + // If for some reason the load is a sextload, the and is needed to zero + // out the high 8 bits + return SDValue(); + } + + bool AddTo = false; + if (AExt.getNode() != nullptr) { + // Re-insert the ext as a zext. + Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), + AExt.getValueType(), Val); + AddTo = true; + } + + // If we get here, the AND is unnecessary. 
Just replace it with the load + DCI.CombineTo(N, Val, AddTo); + } + + return SDValue(); +} + +static SDValue PerformREMCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); + + // Don't do anything at less than -O2. + if (OptLevel < CodeGenOpt::Default) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + EVT VT = N->getValueType(0); + bool IsSigned = N->getOpcode() == ISD::SREM; + unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; + + const SDValue &Num = N->getOperand(0); + const SDValue &Den = N->getOperand(1); + + for (const SDNode *U : Num->uses()) { + if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && + U->getOperand(1) == Den) { + // Num % Den -> Num - (Num / Den) * Den + return DAG.getNode(ISD::SUB, DL, VT, Num, + DAG.getNode(ISD::MUL, DL, VT, + DAG.getNode(DivOpc, DL, VT, Num, Den), + Den)); + } + } + return SDValue(); +} + +enum OperandSignedness { + Signed = 0, + Unsigned, + Unknown +}; + +/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand +/// that can be demoted to \p OptSize bits without loss of information. The +/// signedness of the operand, if determinable, is placed in \p S. +static bool IsMulWideOperandDemotable(SDValue Op, + unsigned OptSize, + OperandSignedness &S) { + S = Unknown; + + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() <= OptSize) { + S = Signed; + return true; + } + } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { + EVT OrigVT = Op.getOperand(0).getValueType(); + if (OrigVT.getSizeInBits() <= OptSize) { + S = Unsigned; + return true; + } + } + + return false; +} + +/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can +/// be demoted to \p OptSize bits without loss of information. If the operands +/// contain a constant, it should appear as the RHS operand. The signedness of +/// the operands is placed in \p IsSigned. +static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, + unsigned OptSize, + bool &IsSigned) { + OperandSignedness LHSSign; + + // The LHS operand must be a demotable op + if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) + return false; + + // We should have been able to determine the signedness from the LHS + if (LHSSign == Unknown) + return false; + + IsSigned = (LHSSign == Signed); + + // The RHS can be a demotable op or a constant + if (ConstantSDNode *CI = dyn_cast(RHS)) { + const APInt &Val = CI->getAPIntValue(); + if (LHSSign == Unsigned) { + return Val.isIntN(OptSize); + } else { + return Val.isSignedIntN(OptSize); + } + } else { + OperandSignedness RHSSign; + if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) + return false; + + return LHSSign == RHSSign; + } +} + +/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply +/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform +/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift +/// amount. 
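+/// For example, a 32-bit multiply whose operands are both sign-extended from
+/// 16 bits becomes NVPTXISD::MUL_WIDE_SIGNED (PTX mul.wide.s16); an SHL by a
+/// constant C is first rewritten as a multiply by 1 << C and then handled the
+/// same way, provided the resulting constant still fits in the narrower type.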
+static SDValue TryMULWIDECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT MulType = N->getValueType(0); + if (MulType != MVT::i32 && MulType != MVT::i64) { + return SDValue(); + } + + SDLoc DL(N); + unsigned OptSize = MulType.getSizeInBits() >> 1; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Canonicalize the multiply so the constant (if any) is on the right + if (N->getOpcode() == ISD::MUL) { + if (isa(LHS)) { + std::swap(LHS, RHS); + } + } + + // If we have a SHL, determine the actual multiply amount + if (N->getOpcode() == ISD::SHL) { + ConstantSDNode *ShlRHS = dyn_cast(RHS); + if (!ShlRHS) { + return SDValue(); + } + + APInt ShiftAmt = ShlRHS->getAPIntValue(); + unsigned BitWidth = MulType.getSizeInBits(); + if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { + APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; + RHS = DCI.DAG.getConstant(MulVal, DL, MulType); + } else { + return SDValue(); + } + } + + bool Signed; + // Verify that our operands are demotable + if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { + return SDValue(); + } + + EVT DemotedVT; + if (MulType == MVT::i32) { + DemotedVT = MVT::i16; + } else { + DemotedVT = MVT::i32; + } + + // Truncate the operands to the correct size. Note that these are just for + // type consistency and will (likely) be eliminated in later phases. + SDValue TruncLHS = + DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); + SDValue TruncRHS = + DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); + + unsigned Opc; + if (Signed) { + Opc = NVPTXISD::MUL_WIDE_SIGNED; + } else { + Opc = NVPTXISD::MUL_WIDE_UNSIGNED; + } + + return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); +} + +/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. +static SDValue PerformMULCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + if (SDValue Ret = TryMULWIDECombine(N, DCI)) + return Ret; + } + + return SDValue(); +} + +/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. +static SDValue PerformSHLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + CodeGenOpt::Level OptLevel) { + if (OptLevel > 0) { + // Try mul.wide combining at OptLevel > 0 + if (SDValue Ret = TryMULWIDECombine(N, DCI)) + return Ret; + } + + return SDValue(); +} + +static SDValue PerformSETCCCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT CCType = N->getValueType(0); + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + + if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) + return SDValue(); + + SDLoc DL(N); + // setp.f16x2 returns two scalar predicates, which we need to + // convert back to v2i1. The returned result will be scalarized by + // the legalizer, but the comparison will remain a single vector + // instruction. 
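+  // In other words, (v2i1 (setcc v2f16 A, B, cc)) is rebuilt below as a
+  // BUILD_VECTOR of the two i1 results produced by a single SETP_F16X2 node.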
+ SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, + DCI.DAG.getVTList(MVT::i1, MVT::i1), + {A, B, N->getOperand(2)}); + return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), + CCNode.getValue(1)); +} + +SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); + switch (N->getOpcode()) { + default: break; + case ISD::ADD: + case ISD::FADD: + return PerformADDCombine(N, DCI, STI, OptLevel); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::UREM: + case ISD::SREM: + return PerformREMCombine(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI); + } + return SDValue(); +} + +/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. +static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results) { + EVT ResVT = N->getValueType(0); + SDLoc DL(N); + + assert(ResVT.isVector() && "Vector load must have vector type"); + + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 loads of <2 x double> here + // but I'm leaving that as a TODO for now. + assert(ResVT.isSimple() && "Can only handle simple types"); + switch (ResVT.getSimpleVT().SimpleTy) { + default: + return; + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f16: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f16: + case MVT::v4f32: + case MVT::v8f16: // <4 x f16x2> + // This is a "native" vector type + break; + } + + LoadSDNode *LD = cast(N); + + unsigned Align = LD->getAlignment(); + auto &TD = DAG.getDataLayout(); + unsigned PrefAlign = + TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); + if (Align < PrefAlign) { + // This load is not sufficiently aligned, so bail out and let this vector + // load be scalarized. Note that we may still be able to emit smaller + // vector loads. For example, if we are loading a <4 x float> with an + // alignment of 8, this check will fail but the legalizer will try again + // with 2 x <2 x float>, which will succeed with an alignment of 8. + return; + } + + EVT EltVT = ResVT.getVectorElementType(); + unsigned NumElts = ResVT.getVectorNumElements(); + + // Since LoadV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propagate the "real" type as the memory type. + bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + bool LoadF16x2 = false; + + switch (NumElts) { + default: + return; + case 2: + Opcode = NVPTXISD::LoadV2; + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + Opcode = NVPTXISD::LoadV4; + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs); + break; + } + case 8: { + // v8f16 is a special case. PTX doesn't have ld.v8.f16 + // instruction. Instead, we split the vector into v2f16 chunks and + // load them with ld.v4.b32. 
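+    // That is, one <8 x f16> load is emitted as a single NVPTXISD::LoadV4
+    // producing four v2f16 results; the individual f16 elements are unpacked
+    // again below with EXTRACT_VECTOR_ELT before the final vector is rebuilt.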
+ assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); + LoadF16x2 = true; + Opcode = NVPTXISD::LoadV4; + EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, + MVT::Other}; + LdResVTs = DAG.getVTList(ListVTs); + break; + } + } + + // Copy regular operands + SmallVector OtherOps(N->op_begin(), N->op_end()); + + // The select routine does not have access to the LoadSDNode instance, so + // pass along the extension information + OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + LD->getMemoryVT(), + LD->getMemOperand()); + + SmallVector ScalarRes; + if (LoadF16x2) { + // Split v2f16 subvectors back into individual elements. + NumElts /= 2; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue SubVector = NewLD.getValue(i); + SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, + DAG.getIntPtrConstant(0, DL)); + SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, + DAG.getIntPtrConstant(1, DL)); + ScalarRes.push_back(E0); + ScalarRes.push_back(E1); + } + } else { + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); +} + +static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results) { + SDValue Chain = N->getOperand(0); + SDValue Intrin = N->getOperand(1); + SDLoc DL(N); + + // Get the intrinsic ID + unsigned IntrinNo = cast(Intrin.getNode())->getZExtValue(); + switch (IntrinNo) { + default: + return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: { + EVT ResVT = N->getValueType(0); + + if (ResVT.isVector()) { + // Vector LDG/LDU + + unsigned NumElts = ResVT.getVectorNumElements(); + EVT EltVT = ResVT.getVectorElementType(); + + // Since LDU/LDG are target nodes, we cannot rely on DAG type + // legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propagate the "real" type as the memory type. 
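+      // When the element type has been widened this way, the results are
+      // truncated back to the original element type below, after the
+      // LDGV2/LDGV4 (or LDUV2/LDUV4) node has been built.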
+ bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + + switch (NumElts) { + default: + return; + case 2: + switch (IntrinNo) { + default: + return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV2; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV2; + break; + } + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + switch (IntrinNo) { + default: + return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV4; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV4; + break; + } + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs); + break; + } + } + + SmallVector OtherOps; + + // Copy regular operands + + OtherOps.push_back(Chain); // Chain + // Skip operand 1 (intrinsic ID) + // Others + OtherOps.append(N->op_begin() + 2, N->op_end()); + + MemIntrinsicSDNode *MemSD = cast(N); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + MemSD->getMemoryVT(), + MemSD->getMemOperand()); + + SmallVector ScalarRes; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = + DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = + DAG.getBuildVector(ResVT, DL, ScalarRes); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); + } else { + // i8 LDG/LDU + assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && + "Custom handling of non-i8 ldu/ldg?"); + + // Just copy all operands as-is + SmallVector Ops(N->op_begin(), N->op_end()); + + // Force output to i16 + SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); + + MemIntrinsicSDNode *MemSD = cast(N); + + // We make sure the memory type is i8, which will be used during isel + // to select the proper instruction. + SDValue NewLD = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, + MVT::i8, MemSD->getMemOperand()); + + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + NewLD.getValue(0))); + Results.push_back(NewLD.getValue(1)); + } + } + } +} + +void NVPTXTargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + report_fatal_error("Unhandled custom legalization"); + case ISD::LOAD: + ReplaceLoadVector(N, DAG, Results); + return; + case ISD::INTRINSIC_W_CHAIN: + ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); + return; + } +} + +// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file. 
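+// Providing these out-of-line definitions in one translation unit gives the
+// classes a home for their vtables, so the compiler does not emit weak copies
+// of them in every object file that includes the headers.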
+void NVPTXSection::anchor() {} + +NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { + delete static_cast(TextSection); + delete static_cast(DataSection); + delete static_cast(BSSSection); + delete static_cast(ReadOnlySection); + + delete static_cast(StaticCtorSection); + delete static_cast(StaticDtorSection); + delete static_cast(LSDASection); + delete static_cast(EHFrameSection); + delete static_cast(DwarfAbbrevSection); + delete static_cast(DwarfInfoSection); + delete static_cast(DwarfLineSection); + delete static_cast(DwarfFrameSection); + delete static_cast(DwarfPubTypesSection); + delete static_cast(DwarfDebugInlineSection); + delete static_cast(DwarfStrSection); + delete static_cast(DwarfLocSection); + delete static_cast(DwarfARangesSection); + delete static_cast(DwarfRangesSection); + delete static_cast(DwarfMacinfoSection); +} + +MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( + const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { + return getDataSection(); +} Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td +++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1,3165 +1,3164 @@ -//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the PTX instructions in TableGen format. -// -//===----------------------------------------------------------------------===// - -include "NVPTXInstrFormats.td" - -// A NOP instruction -let hasSideEffects = 0 in { - def NOP : NVPTXInst<(outs), (ins), "", []>; -} - -let OperandType = "OPERAND_IMMEDIATE" in { - def f16imm : Operand; -} - -// List of vector specific properties -def isVecLD : VecInstTypeEnum<1>; -def isVecST : VecInstTypeEnum<2>; -def isVecBuild : VecInstTypeEnum<3>; -def isVecShuffle : VecInstTypeEnum<4>; -def isVecExtract : VecInstTypeEnum<5>; -def isVecInsert : VecInstTypeEnum<6>; -def isVecDest : VecInstTypeEnum<7>; -def isVecOther : VecInstTypeEnum<15>; - -//===----------------------------------------------------------------------===// -// NVPTX Operand Definitions. 
-//===----------------------------------------------------------------------===// - -def brtarget : Operand; - -// CVT conversion modes -// These must match the enum in NVPTX.h -def CvtNONE : PatLeaf<(i32 0x0)>; -def CvtRNI : PatLeaf<(i32 0x1)>; -def CvtRZI : PatLeaf<(i32 0x2)>; -def CvtRMI : PatLeaf<(i32 0x3)>; -def CvtRPI : PatLeaf<(i32 0x4)>; -def CvtRN : PatLeaf<(i32 0x5)>; -def CvtRZ : PatLeaf<(i32 0x6)>; -def CvtRM : PatLeaf<(i32 0x7)>; -def CvtRP : PatLeaf<(i32 0x8)>; - -def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; -def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; -def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; -def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; -def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; -def CvtRN_FTZ : PatLeaf<(i32 0x15)>; -def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; -def CvtRM_FTZ : PatLeaf<(i32 0x17)>; -def CvtRP_FTZ : PatLeaf<(i32 0x18)>; - -def CvtSAT : PatLeaf<(i32 0x20)>; -def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; - -def CvtMode : Operand { - let PrintMethod = "printCvtMode"; -} - -// Compare modes -// These must match the enum in NVPTX.h -def CmpEQ : PatLeaf<(i32 0)>; -def CmpNE : PatLeaf<(i32 1)>; -def CmpLT : PatLeaf<(i32 2)>; -def CmpLE : PatLeaf<(i32 3)>; -def CmpGT : PatLeaf<(i32 4)>; -def CmpGE : PatLeaf<(i32 5)>; -def CmpEQU : PatLeaf<(i32 10)>; -def CmpNEU : PatLeaf<(i32 11)>; -def CmpLTU : PatLeaf<(i32 12)>; -def CmpLEU : PatLeaf<(i32 13)>; -def CmpGTU : PatLeaf<(i32 14)>; -def CmpGEU : PatLeaf<(i32 15)>; -def CmpNUM : PatLeaf<(i32 16)>; -def CmpNAN : PatLeaf<(i32 17)>; - -def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; -def CmpNE_FTZ : PatLeaf<(i32 0x101)>; -def CmpLT_FTZ : PatLeaf<(i32 0x102)>; -def CmpLE_FTZ : PatLeaf<(i32 0x103)>; -def CmpGT_FTZ : PatLeaf<(i32 0x104)>; -def CmpGE_FTZ : PatLeaf<(i32 0x105)>; -def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>; -def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>; -def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>; -def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>; -def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>; -def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>; -def CmpNUM_FTZ : PatLeaf<(i32 0x110)>; -def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; - -def CmpMode : Operand { - let PrintMethod = "printCmpMode"; -} -def VecElement : Operand { - let PrintMethod = "printVecElement"; -} - -//===----------------------------------------------------------------------===// -// NVPTX Instruction Predicate Definitions -//===----------------------------------------------------------------------===// - - -def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">; -def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">; -def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">; -def useAtomRedG32forGen32 : - Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">; -def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">; -def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">; -def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">; -def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">; -def useAtomRedG64forGen64 : - Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">; -def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">; -def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; -def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; -def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; -def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; -def hasVote : Predicate<"Subtarget->hasVote()">; -def hasDouble : Predicate<"Subtarget->hasDouble()">; -def reqPTX20 : Predicate<"Subtarget->reqPTX20()">; -def hasLDG : Predicate<"Subtarget->hasLDG()">; 
-def hasLDU : Predicate<"Subtarget->hasLDU()">; -def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">; - -def doF32FTZ : Predicate<"useF32FTZ()">; -def doNoF32FTZ : Predicate<"!useF32FTZ()">; - -def doMulWide : Predicate<"doMulWide">; - -def allowFMA : Predicate<"allowFMA()">; -def noFMA : Predicate<"!allowFMA()">; -def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">; - -def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; -def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; - -def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; -def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; - -def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; -def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; - -def true : Predicate<"true">; - -def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; - -def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; - -//===----------------------------------------------------------------------===// -// Some Common Instruction Class Templates -//===----------------------------------------------------------------------===// - -// Template for instructions which take three int64, int32, or int16 args. -// The instructions are named "" (e.g. "add.s64"). -multiclass I3 { - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def i16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; - def i16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; -} - -// Template for instructions which take 3 int32 args. The instructions are -// named ".s32" (e.g. "addc.cc.s32"). -multiclass ADD_SUB_INT_32 { - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; -} - -// Template for instructions which take three fp64 or fp32 args. The -// instructions are named ".f" (e.g. "min.f64"). -// -// Also defines ftz (flush subnormal inputs and results to sign-preserving -// zero) variants for fp32 functions. -// -// This multiclass should be used for nodes that cannot be folded into FMAs. -// For nodes that can be folded into FMAs (i.e. adds and muls), use -// F3_fma_component. 
-multiclass F3 { - def f64rr : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; - def f64ri : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; - def f32rr_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[doF32FTZ]>; - def f32ri_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[doF32FTZ]>; - def f32rr : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; - def f32ri : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; -} - -// Template for instructions which take three FP args. The -// instructions are named ".f" (e.g. "add.f64"). -// -// Also defines ftz (flush subnormal inputs and results to sign-preserving -// zero) variants for fp32/fp16 functions. -// -// This multiclass should be used for nodes that can be folded to make fma ops. -// In this case, we use the ".rn" variant when FMA is disabled, as this behaves -// just like the non ".rn" op, but prevents ptxas from creating FMAs. -multiclass F3_fma_component { - def f64rr : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, - Requires<[allowFMA]>; - def f64ri : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, - Requires<[allowFMA]>; - def f32rr_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA, doF32FTZ]>; - def f32ri_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA, doF32FTZ]>; - def f32rr : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[allowFMA]>; - def f32ri : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[allowFMA]>; - - def f16rr_ftz : - NVPTXInst<(outs Float16Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b), - !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), - [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, - Requires<[useFP16Math, allowFMA, doF32FTZ]>; - def f16rr : - NVPTXInst<(outs Float16Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b), - 
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), - [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, - Requires<[useFP16Math, allowFMA]>; - - def f16x2rr_ftz : - NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16x2Regs:$a, Float16x2Regs:$b), - !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), - [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, - Requires<[useFP16Math, allowFMA, doF32FTZ]>; - def f16x2rr : - NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16x2Regs:$a, Float16x2Regs:$b), - !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), - [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, - Requires<[useFP16Math, allowFMA]>; - - // These have strange names so we don't perturb existing mir tests. - def _rnf64rr : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, - Requires<[noFMA]>; - def _rnf64ri : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, - Requires<[noFMA]>; - def _rnf32rr_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[noFMA, doF32FTZ]>; - def _rnf32ri_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[noFMA, doF32FTZ]>; - def _rnf32rr : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, - Requires<[noFMA]>; - def _rnf32ri : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, - Requires<[noFMA]>; - def _rnf16rr_ftz : - NVPTXInst<(outs Float16Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b), - !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"), - [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, - Requires<[useFP16Math, noFMA, doF32FTZ]>; - def _rnf16rr : - NVPTXInst<(outs Float16Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b), - !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), - [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, - Requires<[useFP16Math, noFMA]>; - def _rnf16x2rr_ftz : - NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16x2Regs:$a, Float16x2Regs:$b), - !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), - [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, - Requires<[useFP16Math, noFMA, doF32FTZ]>; - def _rnf16x2rr : - NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16x2Regs:$a, Float16x2Regs:$b), - !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), - [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, - Requires<[useFP16Math, noFMA]>; -} - -// Template for operations which take two f32 or f64 operands. Provides three -// instructions: .f64, .f32, and .ftz.f32 (flush -// subnormal inputs and results to zero). 
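Aside, for readers of this template: the allowFMA and noFMA variants exist because letting ptxas contract a separate mul+add into an FMA changes rounding. A minimal C++ illustration of that difference (standard IEEE-754 single precision; nothing here is named in the patch, it is only a sketch):

  #include <cmath>
  #include <cstdio>

  int main() {
    float a = 1.0f + 0x1.0p-12f;
    float b = 1.0f + 0x1.0p-12f;
    float c = -(1.0f + 0x1.0p-11f);
    // Fused: round(a*b + c) in one step -> 2^-24.
    float fused = std::fmaf(a, b, c);
    // Separate: round(round(a*b) + c) -> 0 when contraction is disabled
    // (e.g. compiled with -ffp-contract=off).
    float separate = a * b + c;
    std::printf("%a vs %a\n", fused, separate);
    return 0;
  }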
-multiclass F2 { - def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), - !strconcat(OpcStr, ".f64 \t$dst, $a;"), - [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; - def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), - !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, - Requires<[doF32FTZ]>; - def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), - !strconcat(OpcStr, ".f32 \t$dst, $a;"), - [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; -} - -//===----------------------------------------------------------------------===// -// NVPTX Instructions. -//===----------------------------------------------------------------------===// - -//----------------------------------- -// Type Conversion -//----------------------------------- - -let hasSideEffects = 0 in { - // Generate a cvt to the given type from all possible types. Each instance - // takes a CvtMode immediate that defines the conversion mode to use. It can - // be CvtNONE to omit a conversion mode. - multiclass CVT_FROM_ALL { - def _s8 : - NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s8 \t$dst, $src;"), []>; - def _u8 : - NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u8 \t$dst, $src;"), []>; - def _s16 : - NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s16 \t$dst, $src;"), []>; - def _u16 : - NVPTXInst<(outs RC:$dst), - (ins Int16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u16 \t$dst, $src;"), []>; - def _s32 : - NVPTXInst<(outs RC:$dst), - (ins Int32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s32 \t$dst, $src;"), []>; - def _u32 : - NVPTXInst<(outs RC:$dst), - (ins Int32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u32 \t$dst, $src;"), []>; - def _s64 : - NVPTXInst<(outs RC:$dst), - (ins Int64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".s64 \t$dst, $src;"), []>; - def _u64 : - NVPTXInst<(outs RC:$dst), - (ins Int64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".u64 \t$dst, $src;"), []>; - def _f16 : - NVPTXInst<(outs RC:$dst), - (ins Float16Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f16 \t$dst, $src;"), []>; - def _f32 : - NVPTXInst<(outs RC:$dst), - (ins Float32Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f32 \t$dst, $src;"), []>; - def _f64 : - NVPTXInst<(outs RC:$dst), - (ins Float64Regs:$src, CvtMode:$mode), - !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", - FromName, ".f64 \t$dst, $src;"), []>; - } - - // Generate cvts from all types to all types. 
- defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; - defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; - defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; - defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; - defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; - defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; - defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; - defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; - defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>; - defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; - defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; - - // These cvts are different from those above: The source and dest registers - // are of the same type. - def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "cvt.s16.s8 \t$dst, $src;", []>; - def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "cvt.s32.s8 \t$dst, $src;", []>; - def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "cvt.s32.s16 \t$dst, $src;", []>; - def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s8 \t$dst, $src;", []>; - def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s16 \t$dst, $src;", []>; - def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "cvt.s64.s32 \t$dst, $src;", []>; -} - -//----------------------------------- -// Integer Arithmetic -//----------------------------------- - -// Template for xor masquerading as int1 arithmetic. -multiclass ADD_SUB_i1 { - def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; - def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - "xor.pred \t$dst, $a, $b;", - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; -} - -// int1 addition and subtraction are both just xor. -defm ADD_i1 : ADD_SUB_i1; -defm SUB_i1 : ADD_SUB_i1; - -// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we -// also use these for unsigned arithmetic. -defm ADD : I3<"add.s", add>; -defm SUB : I3<"sub.s", sub>; - -// int32 addition and subtraction with carry-out. -// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?). -defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; -defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; - -// int32 addition and subtraction with carry-in and carry-out. -defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; -defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; - -defm MULT : I3<"mul.lo.s", mul>; - -defm MULTHS : I3<"mul.hi.s", mulhs>; -defm MULTHU : I3<"mul.hi.u", mulhu>; - -defm SDIV : I3<"div.s", sdiv>; -defm UDIV : I3<"div.u", udiv>; - -// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM -// will lower it. -defm SREM : I3<"rem.s", srem>; -defm UREM : I3<"rem.u", urem>; - -// Integer absolute value. NumBits should be one minus the bit width of RC. -// This idiom implements the algorithm at -// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs. -multiclass ABS { - def : NVPTXInst<(outs RC:$dst), (ins RC:$a), - !strconcat("abs", SizeName, " \t$dst, $a;"), - [(set RC:$dst, (xor (add (sra RC:$a, (i32 NumBits)), RC:$a), - (sra RC:$a, (i32 NumBits))))]>; -} -defm ABS_16 : ABS; -defm ABS_32 : ABS; -defm ABS_64 : ABS; - -// Integer min/max. 
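The ABS multiclass above matches the branch-free absolute-value idiom from the linked bithacks page. What that sra/add/xor DAG computes, written out in C++ purely as an illustration (the function name is invented here; unsigned arithmetic is used so the wrap-around matches the hardware):

  #include <cstdint>

  // mask is 0 for non-negative a and -1 for negative a (arithmetic shift),
  // so (a + mask) ^ mask equals a for a >= 0 and -a otherwise; INT32_MIN,
  // which has no positive counterpart, wraps back to itself.
  int32_t abs_idiom(int32_t a) {
    int32_t mask = a >> 31;   // (sra $a, NumBits), NumBits = 31 for a 32-bit value
    return int32_t((uint32_t(a) + uint32_t(mask)) ^ uint32_t(mask));
  }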
-defm SMAX : I3<"max.s", smax>; -defm UMAX : I3<"max.u", umax>; -defm SMIN : I3<"min.s", smin>; -defm UMIN : I3<"min.u", umin>; - -// -// Wide multiplication -// -def MULWIDES64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; -def MULWIDES64Imm64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), - "mul.wide.s32 \t$dst, $a, $b;", []>; - -def MULWIDEU64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; -def MULWIDEU64Imm64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), - "mul.wide.u32 \t$dst, $a, $b;", []>; - -def MULWIDES32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; -def MULWIDES32Imm32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - "mul.wide.s16 \t$dst, $a, $b;", []>; - -def MULWIDEU32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; -def MULWIDEU32Imm32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - "mul.wide.u16 \t$dst, $a, $b;", []>; - -def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; -def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; -def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; - -// Matchers for signed, unsigned mul.wide ISD nodes. -def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), - (MULWIDES32Imm Int16Regs:$a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), - (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, - Requires<[doMulWide]>; - -def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), - (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), - (MULWIDES64Imm Int32Regs:$a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), - (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, - Requires<[doMulWide]>; - -// Predicates used for converting some patterns to mul.wide. 
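For readers not fluent in PTX: mul.wide multiplies two narrow operands and produces the full double-width product, which is what the MUL_WIDE_SIGNED / MUL_WIDE_UNSIGNED nodes above are matched to. Roughly, in C++ terms (function names invented for the sketch):

  #include <cstdint>

  int64_t  mul_wide_s32(int32_t a, int32_t b)   { return int64_t(a) * int64_t(b); }    // mul.wide.s32
  uint64_t mul_wide_u32(uint32_t a, uint32_t b) { return uint64_t(a) * uint64_t(b); }  // mul.wide.u32
  int32_t  mul_wide_s16(int16_t a, int16_t b)   { return int32_t(a) * int32_t(b); }    // mul.wide.s16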
-def SInt32Const : PatLeaf<(imm), [{
-  const APInt &v = N->getAPIntValue();
-  return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
-  const APInt &v = N->getAPIntValue();
-  return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
-  const APInt &v = N->getAPIntValue();
-  return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
-  const APInt &v = N->getAPIntValue();
-  return v.isIntN(16);
-}]>;
-
-def Int5Const : PatLeaf<(imm), [{
-  // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
-  const APInt &v = N->getAPIntValue();
-  return v.sge(0) && v.slt(32);
-}]>;
-
-def Int4Const : PatLeaf<(imm), [{
-  // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
-  const APInt &v = N->getAPIntValue();
-  return v.sge(0) && v.slt(16);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
-  const APInt &v = N->getAPIntValue();
-  APInt temp(32, 1);
-  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
-  const APInt &v = N->getAPIntValue();
-  APInt temp(16, 1);
-  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
-          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
-      Requires<[doMulWide]>;
-def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
-          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
-      Requires<[doMulWide]>;
-
-def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
-          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
-      Requires<[doMulWide]>;
-def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
-          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
-      Requires<[doMulWide]>;
-
-// Convert "sign/zero-extend then multiply" to mul.wide.
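The SHL2MUL32 / SHL2MUL16 transforms just above lean on the fact that a left shift by a constant is a multiplication by a power of two, which is how an extended shift becomes a single widening multiply. A hedged C++ sketch of the 32-to-64-bit case (the name and the restricted shift range are mine, not the patch's):

  #include <cstdint>

  // (sext i32 x) << c computed as a widening multiply by (1 << c); the
  // patterns above hand that power-of-two immediate to mul.wide via SHL2MUL32.
  int64_t shl_as_mul_wide(int32_t x, unsigned c) {   // assuming 0 <= c < 31 here
    return int64_t(x) * int64_t(1u << c);            // same value as int64_t(x) << c
  }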
-def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), - (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), - (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, - Requires<[doMulWide]>; - -def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), - (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), - (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, - Requires<[doMulWide]>; - -def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), - (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), - (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, - Requires<[doMulWide]>; - -def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), - (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, - Requires<[doMulWide]>; -def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), - (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, - Requires<[doMulWide]>; - -// -// Integer multiply-add -// -def SDTIMAD : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; -def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; - -def MAD16rrr : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; -def MAD16rri : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; -def MAD16rir : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; -def MAD16rii : - NVPTXInst<(outs Int16Regs:$dst), - (ins Int16Regs:$a, i16imm:$b, i16imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; - -def MAD32rrr : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; -def MAD32rri : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; -def MAD32rir : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; -def MAD32rii : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$a, i32imm:$b, i32imm:$c), - "mad.lo.s32 \t$dst, $a, $b, $c;", - [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>; - -def MAD64rrr : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; -def MAD64rri : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; -def MAD64rir : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, 
Int64Regs:$c))]>; -def MAD64rii : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$a, i64imm:$b, i64imm:$c), - "mad.lo.s64 \t$dst, $a, $b, $c;", - [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; - -def INEG16 : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "neg.s16 \t$dst, $src;", - [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; -def INEG32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "neg.s32 \t$dst, $src;", - [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>; -def INEG64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "neg.s64 \t$dst, $src;", - [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; - -//----------------------------------- -// Floating Point Arithmetic -//----------------------------------- - -// Constant 1.0f -def FloatConst1 : PatLeaf<(fpimm), [{ - return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() && - N->getValueAPF().convertToFloat() == 1.0f; -}]>; -// Constant 1.0 (double) -def DoubleConst1 : PatLeaf<(fpimm), [{ - return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && - N->getValueAPF().convertToDouble() == 1.0; -}]>; - -// Loads FP16 constant into a register. -// -// ptxas does not have hex representation for fp16, so we can't use -// fp16 immediate values in .f16 instructions. Instead we have to load -// the constant into a register using mov.b16. -def LOAD_CONST_F16 : - NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a), - "mov.b16 \t$dst, $a;", []>; - -defm FADD : F3_fma_component<"add", fadd>; -defm FSUB : F3_fma_component<"sub", fsub>; -defm FMUL : F3_fma_component<"mul", fmul>; - -defm FMIN : F3<"min", fminnum>; -defm FMAX : F3<"max", fmaxnum>; - -defm FABS : F2<"abs", fabs>; -defm FNEG : F2<"neg", fneg>; -defm FSQRT : F2<"sqrt.rn", fsqrt>; - -// -// F64 division -// -def FDIV641r : - NVPTXInst<(outs Float64Regs:$dst), - (ins f64imm:$a, Float64Regs:$b), - "rcp.rn.f64 \t$dst, $b;", - [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; -def FDIV64rr : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, Float64Regs:$b), - "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; -def FDIV64ri : - NVPTXInst<(outs Float64Regs:$dst), - (ins Float64Regs:$a, f64imm:$b), - "div.rn.f64 \t$dst, $a, $b;", - [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; - -// -// F32 Approximate reciprocal -// -def FDIV321r_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV321r : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX]>; -// -// F32 Approximate division -// -def FDIV32approxrr_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV32approxri_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.approx.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX, doF32FTZ]>; -def FDIV32approxrr : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.approx.f32 \t$dst, $a, $b;", - [(set 
Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_APPROX]>; -def FDIV32approxri : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.approx.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_APPROX]>; -// -// F32 Semi-accurate reciprocal -// -// rcp.approx gives the same result as div.full(1.0f, a) and is faster. -// -def FDIV321r_approx_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV321r_approx : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.approx.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL]>; -// -// F32 Semi-accurate division -// -def FDIV32rr_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV32ri_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.full.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL, doF32FTZ]>; -def FDIV32rr : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[do_DIVF32_FULL]>; -def FDIV32ri : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.full.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[do_DIVF32_FULL]>; -// -// F32 Accurate reciprocal -// -def FDIV321r_prec_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.rn.ftz.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20, doF32FTZ]>; -def FDIV321r_prec : - NVPTXInst<(outs Float32Regs:$dst), - (ins f32imm:$a, Float32Regs:$b), - "rcp.rn.f32 \t$dst, $b;", - [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; -// -// F32 Accurate division -// -def FDIV32rr_prec_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.rn.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; -def FDIV32ri_prec_ftz : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.rn.ftz.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[doF32FTZ, reqPTX20]>; -def FDIV32rr_prec : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, Float32Regs:$b), - "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, - Requires<[reqPTX20]>; -def FDIV32ri_prec : - NVPTXInst<(outs Float32Regs:$dst), - (ins Float32Regs:$a, f32imm:$b), - "div.rn.f32 \t$dst, $a, $b;", - [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, - Requires<[reqPTX20]>; - -// -// FMA -// - -multiclass FMA { - def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, - Requires<[Pred]>; - def rri : NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, 
ImmCls:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, - Requires<[Pred]>; - def rir : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, RC:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, - Requires<[Pred]>; - def rii : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, ImmCls:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, - Requires<[Pred]>; -} - -multiclass FMA_F16 { - def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), - !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), - [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, - Requires<[useFP16Math, Pred]>; -} - -defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>; -defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>; -defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>; -defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>; -defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; -defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>; -defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>; - -// sin/cos -def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), - "sin.approx.f32 \t$dst, $src;", - [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>, - Requires<[allowUnsafeFPMath]>; -def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), - "cos.approx.f32 \t$dst, $src;", - [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>, - Requires<[allowUnsafeFPMath]>; - -// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)), -// i.e. "poor man's fmod()" - -// frem - f32 FTZ -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 - (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ), - Float32Regs:$y))>, - Requires<[doF32FTZ]>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 - (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ), - fpimm:$y))>, - Requires<[doF32FTZ]>; - -// frem - f32 -def : Pat<(frem Float32Regs:$x, Float32Regs:$y), - (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 - (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI), - Float32Regs:$y))>; -def : Pat<(frem Float32Regs:$x, fpimm:$y), - (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 - (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI), - fpimm:$y))>; - -// frem - f64 -def : Pat<(frem Float64Regs:$x, Float64Regs:$y), - (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 - (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI), - Float64Regs:$y))>; -def : Pat<(frem Float64Regs:$x, fpimm:$y), - (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 - (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI), - fpimm:$y))>; - -//----------------------------------- -// Bitwise operations -//----------------------------------- - -// Template for three-arg bitwise operations. Takes three args, Creates .b16, -// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. 
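The frem patterns above spell out the "poor man's fmod" from the comment: x - floor(x/y) * y, where the CVT with CvtRMI supplies the round-toward-minus-infinity step. An illustrative C++ version (not part of the patch; because it floors rather than truncates, mixed-sign operands can differ from C's fmodf, hence "poor man's"):

  #include <cmath>

  float frem_lowered(float x, float y) {
    // sub(x, mul(cvt.rmi(div(x, y)), y)) from the patterns above.
    return x - std::floor(x / y) * y;
  }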
-multiclass BITWISE { - def b1rr : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; - def b1ri : - NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), - !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), - [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; - def b16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; - def b16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; - def b32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; - def b32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def b64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; - def b64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; -} - -defm OR : BITWISE<"or", or>; -defm AND : BITWISE<"and", and>; -defm XOR : BITWISE<"xor", xor>; - -def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), - "not.pred \t$dst, $src;", - [(set Int1Regs:$dst, (not Int1Regs:$src))]>; -def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "not.b16 \t$dst, $src;", - [(set Int16Regs:$dst, (not Int16Regs:$src))]>; -def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "not.b32 \t$dst, $src;", - [(set Int32Regs:$dst, (not Int32Regs:$src))]>; -def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "not.b64 \t$dst, $src;", - [(set Int64Regs:$dst, (not Int64Regs:$src))]>; - -// Template for left/right shifts. Takes three operands, -// [dest (reg), src (reg), shift (reg or imm)]. -// dest and src may be int64, int32, or int16, but shift is always int32. -// -// This template also defines a 32-bit shift (imm, imm) instruction. 
-multiclass SHIFT { - def i64rr : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>; - def i64ri : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; - def i32rr : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; - def i32ri : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>; - def i32ii : - NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; - def i16rr : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>; - def i16ri : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; -} - -defm SHL : SHIFT<"shl.b", shl>; -defm SRA : SHIFT<"shr.s", sra>; -defm SRL : SHIFT<"shr.u", srl>; - -// Bit-reverse -def BREV32 : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), - "brev.b32 \t$dst, $a;", - [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>; -def BREV64 : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), - "brev.b64 \t$dst, $a;", - [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; - -// -// Rotate: Use ptx shf instruction if available. -// - -// 32 bit r2 = rotl r1, n -// => -// r2 = shf.l r1, r1, n -def ROTL32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTL32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[hasHWROT32]>; - -// 32 bit r2 = rotr r1, n -// => -// r2 = shf.r r1, r1, n -def ROTR32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTR32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[hasHWROT32]>; - -// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 
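With hasHWROT32 a rotate is a single shf.*.wrap.b32 as above; on noHWROT32 targets the software expansions that follow build the same value from two shifts plus an add. For reference, that value is the ordinary rotate (illustrative C++; valid for shift amounts strictly between 0 and 32):

  #include <cstdint>

  // add and or are interchangeable here because the two shifted halves have
  // no overlapping bits, which is why the expansion below can use add.u32.
  uint32_t rotl32(uint32_t x, unsigned n) {
    return (x << n) | (x >> (32u - n));
  }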
-def ROT32imm_sw : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - "shl.b32 \t%lhs, $src, $amt1;\n\t" - "shr.b32 \t%rhs, $src, $amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_32 : SDNodeXFormgetTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, - Requires<[noHWROT32]>; -def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, - Requires<[noHWROT32]>; - -// 32-bit software rotate left by register. -def ROTL32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shl.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shr.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[noHWROT32]>; - -// 32-bit software rotate right by register. -def ROTR32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shr.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shl.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[noHWROT32]>; - -// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. -def ROT64imm_sw : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - "shl.b64 \t%lhs, $src, $amt1;\n\t" - "shr.b64 \t%rhs, $src, $amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_64 : SDNodeXFormgetTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; -def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; - -// 64-bit software rotate left by register. -def ROTL64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "shl.b64 \t%lhs, $src, $amt;\n\t" - "sub.u32 \t%amt2, 64, $amt;\n\t" - "shr.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>; - -def ROTR64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "shr.b64 \t%lhs, $src, $amt;\n\t" - "sub.u32 \t%amt2, 64, $amt;\n\t" - "shl.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>; - -// -// Funnnel shift in clamp mode -// - -// Create SDNodes so they can be used in the DAG code, e.g. 
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -def SDTIntShiftDOp : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisInt<0>, SDTCisInt<3>]>; -def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; -def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; - -def FUNSHFLCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; - -def FUNSHFRCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; - -// -// BFE - bit-field extract -// - -// Template for BFE instructions. Takes four args, -// [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. -// Start may be an imm only if end is also an imm. FIXME: Is this a -// restriction in PTX? -// -// dest and src may be int32 or int64, but start and end are always int32. -multiclass BFE { - def rrr - : NVPTXInst<(outs RC:$d), - (ins RC:$a, Int32Regs:$b, Int32Regs:$c), - !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; - def rri - : NVPTXInst<(outs RC:$d), - (ins RC:$a, Int32Regs:$b, i32imm:$c), - !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; - def rii - : NVPTXInst<(outs RC:$d), - (ins RC:$a, i32imm:$b, i32imm:$c), - !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; -} - -let hasSideEffects = 0 in { - defm BFE_S32 : BFE<"s32", Int32Regs>; - defm BFE_U32 : BFE<"u32", Int32Regs>; - defm BFE_S64 : BFE<"s64", Int64Regs>; - defm BFE_U64 : BFE<"u64", Int64Regs>; -} - -//----------------------------------- -// Comparison instructions (setp, set) -//----------------------------------- - -// FIXME: This doesn't cover versions of set and setp that combine with a -// boolean predicate, e.g. setp.eq.and.b16. 
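FUN_SHFL_CLAMP / FUN_SHFR_CLAMP above are the nodes LowerShiftLeftParts and LowerShiftRightParts emit to assemble 64-bit shifts from 32-bit halves. As a rough sketch of the left-funnel value involved, based on my reading of shf.l.clamp rather than anything stated in this patch:

  #include <cstdint>

  // Shift the 64-bit pair {hi, lo} left and keep the upper 32 bits, with the
  // shift amount clamped to 32 (the ".clamp" mode).
  uint32_t fun_shfl_clamp(uint32_t lo, uint32_t hi, uint32_t amt) {
    uint32_t n = amt > 32 ? 32 : amt;
    uint64_t pair = (uint64_t(hi) << 32) | lo;
    return uint32_t((pair << n) >> 32);
  }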
- -let hasSideEffects = 0 in { - multiclass SETP { - def rr : - NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, - " \t$dst, $a, $b;"), []>; - def ri : - NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, - " \t$dst, $a, $b;"), []>; - def ir : - NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), - !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, - " \t$dst, $a, $b;"), []>; - } -} - -defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; -defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>; -defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>; -defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>; -defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>; -defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>; -defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>; -defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>; -defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; -defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; -defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; -def SETP_f16rr : - NVPTXInst<(outs Int1Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp), - "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", - []>, Requires<[useFP16Math]>; - -def SETP_f16x2rr : - NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), - (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp), - "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", - []>, - Requires<[useFP16Math]>; - - -// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form -// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination -// reg, either u32, s32, or f32. Anyway these aren't used at the moment. - -let hasSideEffects = 0 in { - multiclass SET { - def rr : NVPTXInst<(outs Int32Regs:$dst), - (ins RC:$a, RC:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; - def ri : NVPTXInst<(outs Int32Regs:$dst), - (ins RC:$a, ImmCls:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; - def ir : NVPTXInst<(outs Int32Regs:$dst), - (ins ImmCls:$a, RC:$b, CmpMode:$cmp), - !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; - } -} - -defm SET_b16 : SET<"b16", Int16Regs, i16imm>; -defm SET_s16 : SET<"s16", Int16Regs, i16imm>; -defm SET_u16 : SET<"u16", Int16Regs, i16imm>; -defm SET_b32 : SET<"b32", Int32Regs, i32imm>; -defm SET_s32 : SET<"s32", Int32Regs, i32imm>; -defm SET_u32 : SET<"u32", Int32Regs, i32imm>; -defm SET_b64 : SET<"b64", Int64Regs, i64imm>; -defm SET_s64 : SET<"s64", Int64Regs, i64imm>; -defm SET_u64 : SET<"u64", Int64Regs, i64imm>; -defm SET_f16 : SET<"f16", Float16Regs, f16imm>; -defm SET_f32 : SET<"f32", Float32Regs, f32imm>; -defm SET_f64 : SET<"f64", Float64Regs, f64imm>; - -//----------------------------------- -// Selection instructions (selp) -//----------------------------------- - -// FIXME: Missing slct - -// selp instructions that don't have any pattern matches; we explicitly use -// them within this file. 
-let hasSideEffects = 0 in { - multiclass SELP { - def rr : NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; - def ri : NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; - def ir : NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; - def ii : NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; - } - - multiclass SELP_PATTERN { - def rr : - NVPTXInst<(outs RC:$dst), - (ins RC:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>; - def ri : - NVPTXInst<(outs RC:$dst), - (ins RC:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>; - def ir : - NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, RC:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>; - def ii : - NVPTXInst<(outs RC:$dst), - (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), - !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), - [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; - } -} - -// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as -// good. -defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>; -defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; -defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; -defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>; -defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>; -defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>; -defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>; -defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>; -defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; -defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>; -defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>; -defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>; - -def SELP_f16x2rr : - NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p), - "selp.b32 \t$dst, $a, $b, $p;", - [(set Float16x2Regs:$dst, - (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>; - -//----------------------------------- -// Data Movement (Load / Store, Move) -//----------------------------------- - -def ADDRri : ComplexPattern; -def ADDRri64 : ComplexPattern; - -def MEMri : Operand { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops Int32Regs, i32imm); -} -def MEMri64 : Operand { - let PrintMethod = "printMemOperand"; - let MIOperandInfo = (ops Int64Regs, i64imm); -} - -def imem : Operand { - let PrintMethod = "printOperand"; -} - -def imemAny : Operand { - let PrintMethod = "printOperand"; -} - -def LdStCode : Operand { - let PrintMethod = "printLdStCode"; -} - -def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; -def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; - -// Load a memory address into a u32 or u64 register. -def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), - "mov.u32 \t$dst, $a;", - [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; -def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), - "mov.u64 \t$dst, $a;", - [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; - -// Get pointer to local stack. 
-let hasSideEffects = 0 in { - def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), - "mov.u32 \t$d, __local_depot$num;", []>; - def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), - "mov.u64 \t$d, __local_depot$num;", []>; -} - - -// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp -let IsSimpleMove=1, hasSideEffects=0 in { - def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), - "mov.pred \t$dst, $sss;", []>; - def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), - "mov.u16 \t$dst, $sss;", []>; - def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), - "mov.u32 \t$dst, $sss;", []>; - def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), - "mov.u64 \t$dst, $sss;", []>; - - def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src), - // We have to use .b16 here as there's no mov.f16. - "mov.b16 \t$dst, $src;", []>; - def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), - "mov.f32 \t$dst, $src;", []>; - def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), - "mov.f64 \t$dst, $src;", []>; -} - -def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), - "mov.pred \t$dst, $src;", - [(set Int1Regs:$dst, imm:$src)]>; -def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), - "mov.u16 \t$dst, $src;", - [(set Int16Regs:$dst, imm:$src)]>; -def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), - "mov.u32 \t$dst, $src;", - [(set Int32Regs:$dst, imm:$src)]>; -def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), - "mov.u64 \t$dst, $src;", - [(set Int64Regs:$dst, imm:$src)]>; - -def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), - "mov.f32 \t$dst, $src;", - [(set Float32Regs:$dst, fpimm:$src)]>; -def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), - "mov.f64 \t$dst, $src;", - [(set Float64Regs:$dst, fpimm:$src)]>; - -def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; - -//---- Copy Frame Index ---- -def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), - "add.u32 \t$dst, ${addr:add};", - [(set Int32Regs:$dst, ADDRri:$addr)]>; -def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), - "add.u64 \t$dst, ${addr:add};", - [(set Int64Regs:$dst, ADDRri64:$addr)]>; - -//----------------------------------- -// Comparison and Selection -//----------------------------------- - -multiclass ISET_FORMAT { - // i16 -> pred - def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)), - (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), - (setp_16ri Int16Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), - (setp_16ir imm:$a, Int16Regs:$b, Mode)>; - // i32 -> pred - def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)), - (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)), - (setp_32ri Int32Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)), - (setp_32ir imm:$a, Int32Regs:$b, Mode)>; - // i64 -> pred - def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), - (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), - (setp_64ri Int64Regs:$a, imm:$b, Mode)>; - def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), - (setp_64ir imm:$a, Int64Regs:$b, Mode)>; - - // i16 -> i32 - def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)), - (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; - def : Pat<(i32 
(OpNode Int16Regs:$a, imm:$b)), - (set_16ri Int16Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), - (set_16ir imm:$a, Int16Regs:$b, Mode)>; - // i32 -> i32 - def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)), - (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)), - (set_32ri Int32Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)), - (set_32ir imm:$a, Int32Regs:$b, Mode)>; - // i64 -> i32 - def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), - (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), - (set_64ri Int64Regs:$a, imm:$b, Mode)>; - def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), - (set_64ir imm:$a, Int64Regs:$b, Mode)>; -} - -multiclass ISET_FORMAT_SIGNED - : ISET_FORMAT { - // TableGen doesn't like empty multiclasses. - def : PatLeaf<(i32 0)>; -} - -multiclass ISET_FORMAT_UNSIGNED - : ISET_FORMAT { - // TableGen doesn't like empty multiclasses. - def : PatLeaf<(i32 0)>; -} - -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_SIGNED; -defm : ISET_FORMAT_UNSIGNED; -defm : ISET_FORMAT_UNSIGNED; -defm : ISET_FORMAT_UNSIGNED; -defm : ISET_FORMAT_UNSIGNED; -defm : ISET_FORMAT_UNSIGNED; -defm : ISET_FORMAT_UNSIGNED; - -// i1 compares -def : Pat<(setne Int1Regs:$a, Int1Regs:$b), - (XORb1rr Int1Regs:$a, Int1Regs:$b)>; -def : Pat<(setune Int1Regs:$a, Int1Regs:$b), - (XORb1rr Int1Regs:$a, Int1Regs:$b)>; - -def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), - (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; -def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), - (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; - -// i1 compare -> i32 -def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), - (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; -def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), - (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; - - - -multiclass FSET_FORMAT { - // f16 -> pred - def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), - (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), - (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), - (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), - (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; - def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), - (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - - // f32 -> pred - def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; - def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), - (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; - def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), - (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), - (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; 
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), - (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>; - - // f64 -> pred - def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)), - (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; - def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)), - (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)), - (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>; - - // f16 -> i32 - def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)), - (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)), - (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)), - (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)), - (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, - Requires<[useFP16Math]>; - def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, - Requires<[useFP16Math, doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)), - (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, - Requires<[useFP16Math]>; - - // f32 -> i32 - def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)), - (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), - (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)), - (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), - (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, - Requires<[doF32FTZ]>; - def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)), - (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>; - - // f64 -> i32 - def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)), - (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>; - def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)), - (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>; - def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)), - (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>; -} - -defm FSetOGT : FSET_FORMAT; -defm FSetOLT : FSET_FORMAT; -defm FSetOGE : FSET_FORMAT; -defm FSetOLE : FSET_FORMAT; -defm FSetOEQ : FSET_FORMAT; -defm FSetONE : FSET_FORMAT; - -defm FSetUGT : FSET_FORMAT; -defm FSetULT : FSET_FORMAT; -defm FSetUGE : FSET_FORMAT; -defm FSetULE : FSET_FORMAT; -defm FSetUEQ : FSET_FORMAT; -defm FSetUNE : FSET_FORMAT; - -defm FSetGT : FSET_FORMAT; -defm FSetLT : FSET_FORMAT; -defm FSetGE : FSET_FORMAT; -defm FSetLE : FSET_FORMAT; -defm FSetEQ : FSET_FORMAT; -defm FSetNE : FSET_FORMAT; - -defm FSetNUM : FSET_FORMAT; -defm FSetNAN : FSET_FORMAT; - -// FIXME: What is this doing here? Can it be deleted? 
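The three blocks of FSet* mappings above line up with LLVM's ordered, unordered, and "don't care" setcc codes; the CmpEQU/CmpGTU/... modes are the NaN-tolerant forms. In plain terms (standard setcc semantics, sketched in C++ with invented names):

  #include <cmath>

  bool set_ogt(float a, float b) { return !std::isnan(a) && !std::isnan(b) && a > b; } // ordered: false on NaN
  bool set_ugt(float a, float b) { return std::isnan(a) || std::isnan(b) || a > b; }   // unordered: true on NaN
  bool set_num(float a, float b) { return !std::isnan(a) && !std::isnan(b); }          // CmpNUM
  bool set_nan(float a, float b) { return std::isnan(a) || std::isnan(b); }            // CmpNAN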
-// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, -// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; - -def SDTDeclareParamProfile : - SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; -def SDTDeclareScalarParamProfile : - SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; -def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; -def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; -def SDTCallValProfile : SDTypeProfile<1, 0, []>; -def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; -def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; -def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; -def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; -def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; - -def DeclareParam : - SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareScalarParam : - SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRetParam : - SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def DeclareRet : - SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def PrintCall : - SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintConvergentCall : - SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintCallUni : - SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def PrintConvergentCallUni : - SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamU32 : - 
SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamS32 : - SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgBegin : - SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArg : - SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def LastCallArg : - SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallArgEnd : - SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVoid : - SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def Prototype : - SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def CallVal : - SDNode<"NVPTXISD::CallVal", SDTCallValProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def MoveParam : - SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; -def StoreRetval : - SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, - [SDNPHasChain, SDNPSideEffect]>; -def StoreRetvalV2 : - SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, - [SDNPHasChain, SDNPSideEffect]>; -def StoreRetvalV4 : - SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, - [SDNPHasChain, SDNPSideEffect]>; -def PseudoUseParam : - SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def RETURNNode : - SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, - [SDNPHasChain, SDNPSideEffect]>; - -let mayLoad = 1 in { - class LoadParamMemInst : - NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), - []>; - - class LoadParamV2MemInst : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0+$b];"), []>; - - class LoadParamV4MemInst : - NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins i32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), - []>; -} - -class LoadParamRegInst : - NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), - !strconcat("mov", opstr, " \t$dst, retval$b;"), - [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; - -let mayStore = 1 in { - class StoreParamInst : - NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), - !strconcat("st.param", opstr, " \t[param$a+$b], $val;"), - []>; - - class StoreParamV2Inst : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, - i32imm:$a, i32imm:$b), - !strconcat("st.param.v2", opstr, - " \t[param$a+$b], {{$val, $val2}};"), - []>; - - class StoreParamV4Inst : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, - regclass:$val4, i32imm:$a, - i32imm:$b), - !strconcat("st.param.v4", opstr, - " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"), - []>; - - class StoreRetvalInst : - NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), - !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), - []>; - - class StoreRetvalV2Inst : - NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), - !strconcat("st.param.v2", opstr, - " \t[func_retval0+$a], {{$val, $val2}};"), - []>; - - 
class StoreRetvalV4Inst : - NVPTXInst<(outs), - (ins regclass:$val, regclass:$val2, regclass:$val3, - regclass:$val4, i32imm:$a), - !strconcat("st.param.v4", opstr, - " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), - []>; -} - -let isCall=1 in { - multiclass CALL { - def PrintCallNoRetInst : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; - def PrintCallRetInst1 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; - def PrintCallRetInst2 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; - def PrintCallRetInst3 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; - def PrintCallRetInst4 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), - [(OpNode (i32 4))]>; - def PrintCallRetInst5 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), - [(OpNode (i32 5))]>; - def PrintCallRetInst6 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " - "retval5), "), - [(OpNode (i32 6))]>; - def PrintCallRetInst7 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " - "retval5, retval6), "), - [(OpNode (i32 7))]>; - def PrintCallRetInst8 : NVPTXInst<(outs), (ins), - !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " - "retval5, retval6, retval7), "), - [(OpNode (i32 8))]>; - } -} - -defm Call : CALL<"call", PrintCall>; -defm CallUni : CALL<"call.uni", PrintCallUni>; - -// Convergent call instructions. These are identical to regular calls, except -// they have the isConvergent bit set. -let isConvergent=1 in { - defm ConvergentCall : CALL<"call", PrintConvergentCall>; - defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; -} - -def LoadParamMemI64 : LoadParamMemInst; -def LoadParamMemI32 : LoadParamMemInst; -def LoadParamMemI16 : LoadParamMemInst; -def LoadParamMemI8 : LoadParamMemInst; -def LoadParamMemV2I64 : LoadParamV2MemInst; -def LoadParamMemV2I32 : LoadParamV2MemInst; -def LoadParamMemV2I16 : LoadParamV2MemInst; -def LoadParamMemV2I8 : LoadParamV2MemInst; -def LoadParamMemV4I32 : LoadParamV4MemInst; -def LoadParamMemV4I16 : LoadParamV4MemInst; -def LoadParamMemV4I8 : LoadParamV4MemInst; -def LoadParamMemF16 : LoadParamMemInst; -def LoadParamMemF16x2 : LoadParamMemInst; -def LoadParamMemF32 : LoadParamMemInst; -def LoadParamMemF64 : LoadParamMemInst; -def LoadParamMemV2F16 : LoadParamV2MemInst; -def LoadParamMemV2F16x2: LoadParamV2MemInst; -def LoadParamMemV2F32 : LoadParamV2MemInst; -def LoadParamMemV2F64 : LoadParamV2MemInst; -def LoadParamMemV4F16 : LoadParamV4MemInst; -def LoadParamMemV4F16x2: LoadParamV4MemInst; -def LoadParamMemV4F32 : LoadParamV4MemInst; - -def StoreParamI64 : StoreParamInst; -def StoreParamI32 : StoreParamInst; - -def StoreParamI16 : StoreParamInst; -def StoreParamI8 : StoreParamInst; -def StoreParamV2I64 : StoreParamV2Inst; -def StoreParamV2I32 : StoreParamV2Inst; -def StoreParamV2I16 : StoreParamV2Inst; -def StoreParamV2I8 : StoreParamV2Inst; - -def StoreParamV4I32 : StoreParamV4Inst; -def StoreParamV4I16 : StoreParamV4Inst; -def StoreParamV4I8 : StoreParamV4Inst; - -def StoreParamF16 : StoreParamInst; -def StoreParamF16x2 : StoreParamInst; -def StoreParamF32 : StoreParamInst; -def StoreParamF64 : StoreParamInst; -def StoreParamV2F16 : StoreParamV2Inst; -def StoreParamV2F16x2 : StoreParamV2Inst; -def 
StoreParamV2F32 : StoreParamV2Inst; -def StoreParamV2F64 : StoreParamV2Inst; -def StoreParamV4F16 : StoreParamV4Inst; -def StoreParamV4F16x2 : StoreParamV4Inst; -def StoreParamV4F32 : StoreParamV4Inst; - -def StoreRetvalI64 : StoreRetvalInst; -def StoreRetvalI32 : StoreRetvalInst; -def StoreRetvalI16 : StoreRetvalInst; -def StoreRetvalI8 : StoreRetvalInst; -def StoreRetvalV2I64 : StoreRetvalV2Inst; -def StoreRetvalV2I32 : StoreRetvalV2Inst; -def StoreRetvalV2I16 : StoreRetvalV2Inst; -def StoreRetvalV2I8 : StoreRetvalV2Inst; -def StoreRetvalV4I32 : StoreRetvalV4Inst; -def StoreRetvalV4I16 : StoreRetvalV4Inst; -def StoreRetvalV4I8 : StoreRetvalV4Inst; - -def StoreRetvalF64 : StoreRetvalInst; -def StoreRetvalF32 : StoreRetvalInst; -def StoreRetvalF16 : StoreRetvalInst; -def StoreRetvalF16x2 : StoreRetvalInst; -def StoreRetvalV2F64 : StoreRetvalV2Inst; -def StoreRetvalV2F32 : StoreRetvalV2Inst; -def StoreRetvalV2F16 : StoreRetvalV2Inst; -def StoreRetvalV2F16x2: StoreRetvalV2Inst; -def StoreRetvalV4F32 : StoreRetvalV4Inst; -def StoreRetvalV4F16 : StoreRetvalV4Inst; -def StoreRetvalV4F16x2: StoreRetvalV4Inst; - -def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; -def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; -def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; -def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; - -class CallArgInst : - NVPTXInst<(outs), (ins regclass:$a), "$a, ", - [(CallArg (i32 0), regclass:$a)]>; - -class LastCallArgInst : - NVPTXInst<(outs), (ins regclass:$a), "$a", - [(LastCallArg (i32 0), regclass:$a)]>; - -def CallArgI64 : CallArgInst; -def CallArgI32 : CallArgInst; -def CallArgI16 : CallArgInst; -def CallArgF64 : CallArgInst; -def CallArgF32 : CallArgInst; - -def LastCallArgI64 : LastCallArgInst; -def LastCallArgI32 : LastCallArgInst; -def LastCallArgI16 : LastCallArgInst; -def LastCallArgF64 : LastCallArgInst; -def LastCallArgF32 : LastCallArgInst; - -def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", - [(CallArg (i32 0), (i32 imm:$a))]>; -def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", - [(LastCallArg (i32 0), (i32 imm:$a))]>; - -def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", - [(CallArg (i32 1), (i32 imm:$a))]>; -def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", - [(LastCallArg (i32 1), (i32 imm:$a))]>; - -def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", - [(CallVoid (Wrapper tglobaladdr:$addr))]>; -def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", - [(CallVoid Int32Regs:$addr)]>; -def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", - [(CallVoid Int64Regs:$addr)]>; -def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", - [(Prototype (i32 imm:$val))]>; - -def DeclareRetMemInst : - NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), - ".param .align $align .b8 retval$num[$size];", - [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; -def DeclareRetScalarInst : - NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".param .b$size retval$num;", - [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; -def DeclareRetRegInst : - NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), - ".reg .b$size retval$num;", - [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; - -def DeclareParamInst : - NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), - ".param .align $align .b8 
param$a[$size];", - [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; -def DeclareScalarParamInst : - NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".param .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; -def DeclareScalarRegInst : - NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), - ".reg .b$size param$a;", - [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; - -class MoveParamInst : - NVPTXInst<(outs regclass:$dst), (ins regclass:$src), - !strconcat("mov", asmstr, " \t$dst, $src;"), - [(set regclass:$dst, (MoveParam regclass:$src))]>; - -def MoveParamI64 : MoveParamInst; -def MoveParamI32 : MoveParamInst; -def MoveParamI16 : - NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "cvt.u16.u32 \t$dst, $src;", - [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; -def MoveParamF64 : MoveParamInst; -def MoveParamF32 : MoveParamInst; -def MoveParamF16 : MoveParamInst; - -class PseudoUseParamInst : - NVPTXInst<(outs), (ins regclass:$src), - "// Pseudo use of $src", - [(PseudoUseParam regclass:$src)]>; - -def PseudoUseParamI64 : PseudoUseParamInst; -def PseudoUseParamI32 : PseudoUseParamInst; -def PseudoUseParamI16 : PseudoUseParamInst; -def PseudoUseParamF64 : PseudoUseParamInst; -def PseudoUseParamF32 : PseudoUseParamInst; - - -// -// Load / Store Handling -// -multiclass LD { - def _avar : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr];", []>; - def _areg : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr];", []>; - def _areg_64 : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr];", []>; - def _ari : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; - def _ari_64 : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; - def _asi : NVPTXInst< - (outs regclass:$dst), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t$dst, [$addr+$offset];", []>; -} - -let mayLoad=1, hasSideEffects=0 in { - defm LD_i8 : LD; - defm LD_i16 : LD; - defm LD_i32 : LD; - defm LD_i64 : LD; - defm LD_f16 : LD; - defm LD_f16x2 : LD; - defm LD_f32 : LD; - defm LD_f64 : LD; -} - -multiclass ST { - def _avar : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr], $src;", []>; - def _areg : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, 
LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr], $src;", []>; - def _areg_64 : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr], $src;", []>; - def _ari : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr+$offset], $src;", []>; - def _ari_64 : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr+$offset], $src;", []>; - def _asi : NVPTXInst< - (outs), - (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, - LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" - " \t[$addr+$offset], $src;", []>; -} - -let mayStore=1, hasSideEffects=0 in { - defm ST_i8 : ST; - defm ST_i16 : ST; - defm ST_i32 : ST; - defm ST_i64 : ST; - defm ST_f16 : ST; - defm ST_f16x2 : ST; - defm ST_f32 : ST; - defm ST_f64 : ST; -} - -// The following is used only in and after vector elementizations. Vector -// elementization happens at the machine instruction level, so the following -// instructions never appear in the DAG. -multiclass LD_VEC { - def _v2_avar : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr];", []>; - def _v2_areg : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr];", []>; - def _v2_areg_64 : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr];", []>; - def _v2_ari : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; - def _v2_ari_64 : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; - def _v2_asi : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; - def _v4_avar : NVPTXInst< - (outs 
regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; - def _v4_areg : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; - def _v4_areg_64 : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; - def _v4_ari : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; - def _v4_ari_64 : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; - def _v4_asi : NVPTXInst< - (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; -} -let mayLoad=1, hasSideEffects=0 in { - defm LDV_i8 : LD_VEC; - defm LDV_i16 : LD_VEC; - defm LDV_i32 : LD_VEC; - defm LDV_i64 : LD_VEC; - defm LDV_f16 : LD_VEC; - defm LDV_f16x2 : LD_VEC; - defm LDV_f32 : LD_VEC; - defm LDV_f64 : LD_VEC; -} - -multiclass ST_VEC { - def _v2_avar : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2}};", []>; - def _v2_areg : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2}};", []>; - def _v2_areg_64 : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2}};", []>; - def _v2_ari : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; - def _v2_ari_64 : NVPTXInst< - (outs), - 
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; - def _v2_asi : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, - i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2}};", []>; - def _v4_avar : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; - def _v4_areg : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; - def _v4_areg_64 : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; - def _v4_ari : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; - def _v4_ari_64 : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " - "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; - def _v4_asi : NVPTXInst< - (outs), - (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, - i32imm:$fromWidth, imem:$addr, i32imm:$offset), - "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" - "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; -} - -let mayStore=1, hasSideEffects=0 in { - defm STV_i8 : ST_VEC; - defm STV_i16 : ST_VEC; - defm STV_i32 : ST_VEC; - defm STV_i64 : ST_VEC; - defm STV_f16 : ST_VEC; - defm STV_f16x2 : ST_VEC; - defm STV_f32 : ST_VEC; - defm STV_f64 : ST_VEC; -} - -//---- Conversion ---- - -class F_BITCONVERT : - NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), - !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")), - [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; - -def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>; -def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>; -def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; -def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; -def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; -def BITCONVERT_64_F2I : F_BITCONVERT<"64", 
Float64Regs, Int64Regs>; -def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>; -def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>; - -// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where -// we cannot specify floating-point literals in isel patterns. Therefore, we -// use an integer selp to select either 1 or 0 and then cvt to floating-point. - -// sint -> f16 -def : Pat<(f16 (sint_to_fp Int1Regs:$a)), - (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f16 (sint_to_fp Int16Regs:$a)), - (CVT_f16_s16 Int16Regs:$a, CvtRN)>; -def : Pat<(f16 (sint_to_fp Int32Regs:$a)), - (CVT_f16_s32 Int32Regs:$a, CvtRN)>; -def : Pat<(f16 (sint_to_fp Int64Regs:$a)), - (CVT_f16_s64 Int64Regs:$a, CvtRN)>; - -// uint -> f16 -def : Pat<(f16 (uint_to_fp Int1Regs:$a)), - (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f16 (uint_to_fp Int16Regs:$a)), - (CVT_f16_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f16 (uint_to_fp Int32Regs:$a)), - (CVT_f16_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f16 (uint_to_fp Int64Regs:$a)), - (CVT_f16_u64 Int64Regs:$a, CvtRN)>; - -// sint -> f32 -def : Pat<(f32 (sint_to_fp Int1Regs:$a)), - (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f32 (sint_to_fp Int16Regs:$a)), - (CVT_f32_s16 Int16Regs:$a, CvtRN)>; -def : Pat<(f32 (sint_to_fp Int32Regs:$a)), - (CVT_f32_s32 Int32Regs:$a, CvtRN)>; -def : Pat<(f32 (sint_to_fp Int64Regs:$a)), - (CVT_f32_s64 Int64Regs:$a, CvtRN)>; - -// uint -> f32 -def : Pat<(f32 (uint_to_fp Int1Regs:$a)), - (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f32 (uint_to_fp Int16Regs:$a)), - (CVT_f32_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f32 (uint_to_fp Int32Regs:$a)), - (CVT_f32_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f32 (uint_to_fp Int64Regs:$a)), - (CVT_f32_u64 Int64Regs:$a, CvtRN)>; - -// sint -> f64 -def : Pat<(f64 (sint_to_fp Int1Regs:$a)), - (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f64 (sint_to_fp Int16Regs:$a)), - (CVT_f64_s16 Int16Regs:$a, CvtRN)>; -def : Pat<(f64 (sint_to_fp Int32Regs:$a)), - (CVT_f64_s32 Int32Regs:$a, CvtRN)>; -def : Pat<(f64 (sint_to_fp Int64Regs:$a)), - (CVT_f64_s64 Int64Regs:$a, CvtRN)>; - -// uint -> f64 -def : Pat<(f64 (uint_to_fp Int1Regs:$a)), - (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; -def : Pat<(f64 (uint_to_fp Int16Regs:$a)), - (CVT_f64_u16 Int16Regs:$a, CvtRN)>; -def : Pat<(f64 (uint_to_fp Int32Regs:$a)), - (CVT_f64_u32 Int32Regs:$a, CvtRN)>; -def : Pat<(f64 (uint_to_fp Int64Regs:$a)), - (CVT_f64_u64 Int64Regs:$a, CvtRN)>; - - -// f16 -> sint -def : Pat<(i1 (fp_to_sint Float16Regs:$a)), - (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint Float16Regs:$a)), - (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_sint Float16Regs:$a)), - (CVT_s16_f16 Float16Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint Float16Regs:$a)), - (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_sint Float16Regs:$a)), - (CVT_s32_f16 Float16Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint Float16Regs:$a)), - (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_sint Float16Regs:$a)), - (CVT_s64_f16 Float16Regs:$a, CvtRZI)>; - -// f16 -> uint -def : Pat<(i1 (fp_to_uint Float16Regs:$a)), - (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint Float16Regs:$a)), - (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, 
Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_uint Float16Regs:$a)), - (CVT_u16_f16 Float16Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint Float16Regs:$a)), - (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_uint Float16Regs:$a)), - (CVT_u32_f16 Float16Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint Float16Regs:$a)), - (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_uint Float16Regs:$a)), - (CVT_u64_f16 Float16Regs:$a, CvtRZI)>; - -// f32 -> sint -def : Pat<(i1 (fp_to_sint Float32Regs:$a)), - (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint Float32Regs:$a)), - (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_sint Float32Regs:$a)), - (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint Float32Regs:$a)), - (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_sint Float32Regs:$a)), - (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint Float32Regs:$a)), - (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_sint Float32Regs:$a)), - (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; - -// f32 -> uint -def : Pat<(i1 (fp_to_uint Float32Regs:$a)), - (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint Float32Regs:$a)), - (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i16 (fp_to_uint Float32Regs:$a)), - (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint Float32Regs:$a)), - (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i32 (fp_to_uint Float32Regs:$a)), - (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint Float32Regs:$a)), - (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(i64 (fp_to_uint Float32Regs:$a)), - (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; - -// f64 -> sint -def : Pat<(i1 (fp_to_sint Float64Regs:$a)), - (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint Float64Regs:$a)), - (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint Float64Regs:$a)), - (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint Float64Regs:$a)), - (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; - -// f64 -> uint -def : Pat<(i1 (fp_to_uint Float64Regs:$a)), - (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint Float64Regs:$a)), - (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint Float64Regs:$a)), - (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint Float64Regs:$a)), - (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; - -// sext i1 -def : Pat<(i16 (sext Int1Regs:$a)), - (SELP_s16ii -1, 0, Int1Regs:$a)>; -def : Pat<(i32 (sext Int1Regs:$a)), - (SELP_s32ii -1, 0, Int1Regs:$a)>; -def : Pat<(i64 (sext Int1Regs:$a)), - (SELP_s64ii -1, 0, Int1Regs:$a)>; - -// zext i1 -def : Pat<(i16 (zext Int1Regs:$a)), - (SELP_u16ii 1, 0, Int1Regs:$a)>; -def : Pat<(i32 (zext Int1Regs:$a)), - (SELP_u32ii 1, 0, Int1Regs:$a)>; -def : Pat<(i64 (zext Int1Regs:$a)), - (SELP_u64ii 1, 0, Int1Regs:$a)>; - -// anyext i1 -def : Pat<(i16 (anyext Int1Regs:$a)), - (SELP_u16ii -1, 0, Int1Regs:$a)>; -def : Pat<(i32 (anyext Int1Regs:$a)), - (SELP_u32ii -1, 0, Int1Regs:$a)>; -def : Pat<(i64 (anyext Int1Regs:$a)), - (SELP_u64ii -1, 0, Int1Regs:$a)>; - -// sext i16 -def : Pat<(i32 (sext Int16Regs:$a)), - (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (sext 
Int16Regs:$a)), - (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; - -// zext i16 -def : Pat<(i32 (zext Int16Regs:$a)), - (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (zext Int16Regs:$a)), - (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; - -// anyext i16 -def : Pat<(i32 (anyext Int16Regs:$a)), - (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; -def : Pat<(i64 (anyext Int16Regs:$a)), - (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; - -// sext i32 -def : Pat<(i64 (sext Int32Regs:$a)), - (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; - -// zext i32 -def : Pat<(i64 (zext Int32Regs:$a)), - (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; - -// anyext i32 -def : Pat<(i64 (anyext Int32Regs:$a)), - (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; - - -// truncate i64 -def : Pat<(i32 (trunc Int64Regs:$a)), - (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; -def : Pat<(i16 (trunc Int64Regs:$a)), - (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; -def : Pat<(i1 (trunc Int64Regs:$a)), - (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; - -// truncate i32 -def : Pat<(i16 (trunc Int32Regs:$a)), - (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; -def : Pat<(i1 (trunc Int32Regs:$a)), - (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; - -// truncate i16 -def : Pat<(i1 (trunc Int16Regs:$a)), - (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; - -// sext_inreg -def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; -def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; -def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; -def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; - - -// Select instructions with 32-bit predicates -def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), - (SELP_b16rr Int16Regs:$a, Int16Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), - (SELP_b32rr Int32Regs:$a, Int32Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), - (SELP_b64rr Int64Regs:$a, Int64Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b), - (SELP_f16rr Float16Regs:$a, Float16Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), - (SELP_f32rr Float32Regs:$a, Float32Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; -def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), - (SELP_f64rr Float64Regs:$a, Float64Regs:$b, - (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; - - -let hasSideEffects = 0 in { - // pack a set of smaller int registers to a larger int register - def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), - (ins Int16Regs:$s1, Int16Regs:$s2, - Int16Regs:$s3, Int16Regs:$s4), - "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; - def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), - (ins Int16Regs:$s1, Int16Regs:$s2), - "mov.b32 \t$d, {{$s1, $s2}};", []>; - def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), - (ins Int32Regs:$s1, Int32Regs:$s2), - "mov.b64 \t$d, {{$s1, $s2}};", []>; - def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), - (ins Float32Regs:$s1, Float32Regs:$s2), - "mov.b64 \t$d, {{$s1, $s2}};", []>; - - // unpack a larger int register to a set of smaller int registers - def I64toV4I16 : 
NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, - Int16Regs:$d3, Int16Regs:$d4), - (ins Int64Regs:$s), - "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; - def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), - (ins Int32Regs:$s), - "mov.b32 \t{{$d1, $d2}}, $s;", []>; - def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), - (ins Int64Regs:$s), - "mov.b64 \t{{$d1, $d2}}, $s;", []>; - def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), - (ins Float64Regs:$s), - "mov.b64 \t{{$d1, $d2}}, $s;", []>; - -} - -let hasSideEffects = 0 in { - // Extract element of f16x2 register. PTX does not provide any way - // to access elements of f16x2 vector directly, so we need to - // extract it using a temporary register. - def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst), - (ins Float16x2Regs:$src), - "{{ .reg .b16 \t%tmp_hi;\n\t" - " mov.b32 \t{$dst, %tmp_hi}, $src; }}", - [(set Float16Regs:$dst, - (extractelt (v2f16 Float16x2Regs:$src), 0))]>; - def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst), - (ins Float16x2Regs:$src), - "{{ .reg .b16 \t%tmp_lo;\n\t" - " mov.b32 \t{%tmp_lo, $dst}, $src; }}", - [(set Float16Regs:$dst, - (extractelt (v2f16 Float16x2Regs:$src), 1))]>; - - // Coalesce two f16 registers into f16x2 - def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst), - (ins Float16Regs:$a, Float16Regs:$b), - "mov.b32 \t$dst, {{$a, $b}};", - [(set Float16x2Regs:$dst, - (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>; - - // Directly initializing underlying the b32 register is one less SASS - // instruction than than vector-packing move. - def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src), - "mov.b32 \t$dst, $src;", - []>; - - // Split f16x2 into two f16 registers. - def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), - (ins Float16x2Regs:$src), - "mov.b32 \t{{$lo, $hi}}, $src;", - []>; - // Split an i32 into two f16 - def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), - (ins Int32Regs:$src), - "mov.b32 \t{{$lo, $hi}}, $src;", - []>; -} - -// Count leading zeros -let hasSideEffects = 0 in { - def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), - "clz.b32 \t$d, $a;", []>; - def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "clz.b64 \t$d, $a;", []>; -} - -// 32-bit has a direct PTX instruction -def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; - -// The return type of the ctlz ISD node is the same as its input, but the PTX -// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the -// ptx value to 64 bits to match the ISD node's semantics, unless we know we're -// truncating back down to 32 bits. -def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; - -// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the -// result back to 16-bits if necessary. We also need to subtract 16 because -// the high-order 16 zeros were counted. -// -// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could -// use to save one SASS instruction (on sm_35 anyway): -// -// mov.b32 $tmp, {0xffff, $a} -// ctlz.b32 $result, $tmp -// -// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" -// and then ctlz that value. This way we don't have to subtract 16 from the -// result. Unfortunately today we don't have a way to generate -// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 
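[editor's note] The ctlz.i16 comment above relies on a small arithmetic identity: counting leading zeros of the zero-extended value and subtracting 16 gives the same answer as counting leading zeros of the value placed in the upper half of a 32-bit word with ones packed below it. A minimal host-side C++ sketch of that identity, not part of the patch itself; clz32() and the variable names are local helpers, and the operand order of the mov.b32 packing is assumed to be as the TODO describes (constant in the low half).

// Exhaustive check that the two ctlz.i16 lowering strategies agree.
#include <cassert>
#include <cstdint>
#include <cstdio>

static unsigned clz32(uint32_t v) {
  unsigned n = 0;
  for (uint32_t mask = 0x80000000u; mask != 0 && (v & mask) == 0; mask >>= 1)
    ++n;
  return n; // 32 when v == 0, matching ctlz's bit-width result for zero
}

int main() {
  for (uint32_t x = 0; x <= 0xFFFFu; ++x) {
    // Current lowering: zero-extend to 32 bits, clz, then subtract 16.
    unsigned viaZeroExtend = clz32(x) - 16;
    // TODO's alternative: value in the high half, ones below, no subtract.
    unsigned viaOnesBelow = clz32((x << 16) | 0xFFFFu);
    assert(viaZeroExtend == viaOnesBelow);
  }
  std::puts("both ctlz.i16 lowerings agree on all 65536 inputs");
  return 0;
}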
-def : Pat<(ctlz Int16Regs:$a), - (SUBi16ri (CVT_u16_u32 - (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), - (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; - -// Population count -let hasSideEffects = 0 in { - def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), - "popc.b32 \t$d, $a;", []>; - def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "popc.b64 \t$d, $a;", []>; -} - -// 32-bit has a direct PTX instruction -def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; - -// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit -// to match the LLVM semantics. Just as with ctlz.i64, we provide a second -// pattern that avoids the type conversion if we're truncating the result to -// i32 anyway. -def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; -def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; - -// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. -// If we know that we're storing into an i32, we can avoid the final trunc. -def : Pat<(ctpop Int16Regs:$a), - (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), - (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; - -// fpround f32 -> f16 -def : Pat<(f16 (fpround Float32Regs:$a)), - (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f16 (fpround Float32Regs:$a)), - (CVT_f16_f32 Float32Regs:$a, CvtRN)>; - -// fpround f64 -> f16 -def : Pat<(f16 (fpround Float64Regs:$a)), - (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f16 (fpround Float64Regs:$a)), - (CVT_f16_f64 Float64Regs:$a, CvtRN)>; - -// fpround f64 -> f32 -def : Pat<(f32 (fpround Float64Regs:$a)), - (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fpround Float64Regs:$a)), - (CVT_f32_f64 Float64Regs:$a, CvtRN)>; - -// fpextend f16 -> f32 -def : Pat<(f32 (fpextend Float16Regs:$a)), - (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fpextend Float16Regs:$a)), - (CVT_f32_f16 Float16Regs:$a, CvtNONE)>; - -// fpextend f16 -> f64 -def : Pat<(f64 (fpextend Float16Regs:$a)), - (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f64 (fpextend Float16Regs:$a)), - (CVT_f64_f16 Float16Regs:$a, CvtNONE)>; - -// fpextend f32 -> f64 -def : Pat<(f64 (fpextend Float32Regs:$a)), - (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f64 (fpextend Float32Regs:$a)), - (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; - -def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; - -// fceil, ffloor, fround, ftrunc. 
- -def : Pat<(fceil Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(fceil Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; -def : Pat<(fceil Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(fceil Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; -def : Pat<(fceil Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, CvtRPI)>; - -def : Pat<(ffloor Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(ffloor Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; -def : Pat<(ffloor Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(ffloor Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; -def : Pat<(ffloor Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, CvtRMI)>; - -def : Pat<(fround Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f16 (fround Float16Regs:$a)), - (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(fround Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(f32 (fround Float32Regs:$a)), - (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(f64 (fround Float64Regs:$a)), - (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; - -def : Pat<(ftrunc Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(ftrunc Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; -def : Pat<(ftrunc Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(ftrunc Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; -def : Pat<(ftrunc Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, CvtRZI)>; - -// nearbyint and rint are implemented as rounding to nearest even. This isn't -// strictly correct, because it causes us to ignore the rounding mode. But it -// matches what CUDA's "libm" does. 
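[editor's note] The caveat above is easiest to see side by side: frint/fnearbyint are defined in terms of the dynamic rounding mode, while a cvt.rni-style lowering always rounds to nearest-even. A host-side C++ sketch only, unrelated to the patch; roundNearestEven() is a local stand-in for the fixed nearest-even behavior, not LLVM or PTX code.

#include <cfenv>
#include <cmath>
#include <cstdio>

// Round to nearest, ties to even, regardless of the current rounding mode.
static double roundNearestEven(double x) {
  double f = std::floor(x);
  double frac = x - f;
  if (frac > 0.5) return f + 1.0;
  if (frac < 0.5) return f;
  return (std::fmod(f, 2.0) == 0.0) ? f : f + 1.0; // tie: pick the even neighbor
}

int main() {
  std::fesetround(FE_UPWARD);      // the program changes the rounding mode...
  volatile double half = 0.5;      // volatile blocks compile-time folding
  std::printf("rint(0.5) under FE_UPWARD         = %g\n", std::rint(half));
  std::printf("fixed round-to-nearest-even(0.5)  = %g\n", roundNearestEven(half));
  // Typically prints 1 and 0: the host's rint honors FE_UPWARD, while the
  // fixed nearest-even rounding (what a cvt.rni lowering produces) does not.
  return 0;
}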
- -def : Pat<(fnearbyint Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(fnearbyint Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(fnearbyint Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(fnearbyint Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(fnearbyint Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; - -def : Pat<(frint Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(frint Float16Regs:$a), - (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(frint Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; -def : Pat<(frint Float32Regs:$a), - (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; -def : Pat<(frint Float64Regs:$a), - (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; - - -//----------------------------------- -// Control-flow -//----------------------------------- - -let isTerminator=1 in { - let isReturn=1, isBarrier=1 in - def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>; - - let isBranch=1 in - def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), - "@$a bra \t$target;", - [(brcond Int1Regs:$a, bb:$target)]>; - let isBranch=1 in - def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), - "@!$a bra \t$target;", []>; - - let isBranch=1, isBarrier=1 in - def GOTO : NVPTXInst<(outs), (ins brtarget:$target), - "bra.uni \t$target;", [(br bb:$target)]>; -} - -def : Pat<(brcond Int32Regs:$a, bb:$target), - (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; - -// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a -// conditional branch if the target block is the next block so that the code -// can fall through to the target block. The invertion is done by 'xor -// condition, 1', which will be translated to (setne condition, -1). Since ptx -// supports '@!pred bra target', we should use it. -def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), - (CBranchOther Int1Regs:$a, bb:$target)>; - -// Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; -def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPSideEffect]>; - -def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; -def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def calltarget : Operand; -let isCall=1 in { - def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; -} - -def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; -def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; - -// Pseudo instructions. 
-class Pseudo pattern> - : NVPTXInst; - -def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt), - "\\{ // callseq $amt\n" - "\t.reg .b32 temp_param_reg;", - [(callseq_start timm:$amt)]>; -def Callseq_End : - NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\} // callseq $amt1", - [(callseq_end timm:$amt1, timm:$amt2)]>; - -// trap instruction -def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; - -// Call prototype wrapper -def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def CallPrototype : - SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def ProtoIdent : Operand { - let PrintMethod = "printProtoIdent"; -} -def CALL_PROTOTYPE : - NVPTXInst<(outs), (ins ProtoIdent:$ident), - "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; - - -include "NVPTXIntrinsics.td" - - -//----------------------------------- -// Notes -//----------------------------------- -// BSWAP is currently expanded. The following is a more efficient -// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register -// - for sm_20, use pmpt (use vector scalar mov to get the pack and -// unpack). sm_20 supports native 32-bit register, but not native 16-bit -// register. +//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PTX instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +include "NVPTXInstrFormats.td" + +// A NOP instruction +let hasSideEffects = 0 in { + def NOP : NVPTXInst<(outs), (ins), "", []>; +} + +let OperandType = "OPERAND_IMMEDIATE" in { + def f16imm : Operand; +} + +// List of vector specific properties +def isVecLD : VecInstTypeEnum<1>; +def isVecST : VecInstTypeEnum<2>; +def isVecBuild : VecInstTypeEnum<3>; +def isVecShuffle : VecInstTypeEnum<4>; +def isVecExtract : VecInstTypeEnum<5>; +def isVecInsert : VecInstTypeEnum<6>; +def isVecDest : VecInstTypeEnum<7>; +def isVecOther : VecInstTypeEnum<15>; + +//===----------------------------------------------------------------------===// +// NVPTX Operand Definitions. 
+//===----------------------------------------------------------------------===// + +def brtarget : Operand; + +// CVT conversion modes +// These must match the enum in NVPTX.h +def CvtNONE : PatLeaf<(i32 0x0)>; +def CvtRNI : PatLeaf<(i32 0x1)>; +def CvtRZI : PatLeaf<(i32 0x2)>; +def CvtRMI : PatLeaf<(i32 0x3)>; +def CvtRPI : PatLeaf<(i32 0x4)>; +def CvtRN : PatLeaf<(i32 0x5)>; +def CvtRZ : PatLeaf<(i32 0x6)>; +def CvtRM : PatLeaf<(i32 0x7)>; +def CvtRP : PatLeaf<(i32 0x8)>; + +def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; +def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; +def CvtRZI_FTZ : PatLeaf<(i32 0x12)>; +def CvtRMI_FTZ : PatLeaf<(i32 0x13)>; +def CvtRPI_FTZ : PatLeaf<(i32 0x14)>; +def CvtRN_FTZ : PatLeaf<(i32 0x15)>; +def CvtRZ_FTZ : PatLeaf<(i32 0x16)>; +def CvtRM_FTZ : PatLeaf<(i32 0x17)>; +def CvtRP_FTZ : PatLeaf<(i32 0x18)>; + +def CvtSAT : PatLeaf<(i32 0x20)>; +def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; + +def CvtMode : Operand { + let PrintMethod = "printCvtMode"; +} + +// Compare modes +// These must match the enum in NVPTX.h +def CmpEQ : PatLeaf<(i32 0)>; +def CmpNE : PatLeaf<(i32 1)>; +def CmpLT : PatLeaf<(i32 2)>; +def CmpLE : PatLeaf<(i32 3)>; +def CmpGT : PatLeaf<(i32 4)>; +def CmpGE : PatLeaf<(i32 5)>; +def CmpEQU : PatLeaf<(i32 10)>; +def CmpNEU : PatLeaf<(i32 11)>; +def CmpLTU : PatLeaf<(i32 12)>; +def CmpLEU : PatLeaf<(i32 13)>; +def CmpGTU : PatLeaf<(i32 14)>; +def CmpGEU : PatLeaf<(i32 15)>; +def CmpNUM : PatLeaf<(i32 16)>; +def CmpNAN : PatLeaf<(i32 17)>; + +def CmpEQ_FTZ : PatLeaf<(i32 0x100)>; +def CmpNE_FTZ : PatLeaf<(i32 0x101)>; +def CmpLT_FTZ : PatLeaf<(i32 0x102)>; +def CmpLE_FTZ : PatLeaf<(i32 0x103)>; +def CmpGT_FTZ : PatLeaf<(i32 0x104)>; +def CmpGE_FTZ : PatLeaf<(i32 0x105)>; +def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>; +def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>; +def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>; +def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>; +def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>; +def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>; +def CmpNUM_FTZ : PatLeaf<(i32 0x110)>; +def CmpNAN_FTZ : PatLeaf<(i32 0x111)>; + +def CmpMode : Operand { + let PrintMethod = "printCmpMode"; +} +def VecElement : Operand { + let PrintMethod = "printVecElement"; +} + +//===----------------------------------------------------------------------===// +// NVPTX Instruction Predicate Definitions +//===----------------------------------------------------------------------===// + + +def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">; +def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">; +def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">; +def useAtomRedG32forGen32 : + Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">; +def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">; +def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">; +def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">; +def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">; +def useAtomRedG64forGen64 : + Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">; +def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">; +def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; +def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; +def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; +def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; +def hasVote : Predicate<"Subtarget->hasVote()">; +def hasDouble : Predicate<"Subtarget->hasDouble()">; +def reqPTX20 : Predicate<"Subtarget->reqPTX20()">; +def hasLDG : Predicate<"Subtarget->hasLDG()">; 
+def hasLDU : Predicate<"Subtarget->hasLDU()">; +def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">; + +def doF32FTZ : Predicate<"useF32FTZ()">; +def doNoF32FTZ : Predicate<"!useF32FTZ()">; + +def doMulWide : Predicate<"doMulWide">; + +def allowFMA : Predicate<"allowFMA()">; +def noFMA : Predicate<"!allowFMA()">; +def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">; + +def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">; +def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">; + +def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">; +def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; + +def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; +def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; + +def true : Predicate<"true">; + +def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; + +def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; + +//===----------------------------------------------------------------------===// +// Some Common Instruction Class Templates +//===----------------------------------------------------------------------===// + +// Template for instructions which take three int64, int32, or int16 args. +// The instructions are named "" (e.g. "add.s64"). +multiclass I3 { + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def i16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def i16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; +} + +// Template for instructions which take 3 int32 args. The instructions are +// named ".s32" (e.g. "addc.cc.s32"). +multiclass ADD_SUB_INT_32 { + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; +} + +// Template for instructions which take three fp64 or fp32 args. The +// instructions are named ".f" (e.g. "min.f64"). +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32 functions. +// +// This multiclass should be used for nodes that cannot be folded into FMAs. +// For nodes that can be folded into FMAs (i.e. adds and muls), use +// F3_fma_component. 
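[editor's note] Before the FP templates below, it may help to see why the FMA-foldable vs. non-foldable split (and the allowFMA/noFMA predicates above) can change numeric results at all: a fused multiply-add rounds once, a separate multiply then add rounds twice, and the two can differ. A small host-side C++ sketch with locally chosen values, not tied to this patch.

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  double a = 1.0 + DBL_EPSILON;     // 1 + 2^-52
  double b = 1.0 - DBL_EPSILON;     // 1 - 2^-52
  double c = -1.0;

  volatile double prod = a * b;     // 1 - 2^-104 rounds up to exactly 1.0;
                                    // volatile also stops the compiler from
                                    // contracting the expression itself
  double separate = prod + c;       // 0.0: the tiny term was lost in rounding
  double fused = std::fma(a, b, c); // -2^-104: preserved by the single rounding

  std::printf("mul+add: %.17g\n", separate);
  std::printf("fma    : %.17g\n", fused);
  return 0;
}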
+multiclass F3 { + def f64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>; + def f64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>; + def f32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ]>; + def f32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ]>; + def f32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>; + def f32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>; +} + +// Template for instructions which take three FP args. The +// instructions are named ".f" (e.g. "add.f64"). +// +// Also defines ftz (flush subnormal inputs and results to sign-preserving +// zero) variants for fp32/fp16 functions. +// +// This multiclass should be used for nodes that can be folded to make fma ops. +// In this case, we use the ".rn" variant when FMA is disabled, as this behaves +// just like the non ".rn" op, but prevents ptxas from creating FMAs. +multiclass F3_fma_component { + def f64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[allowFMA]>; + def f64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; + def f32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA, doF32FTZ]>; + def f32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA, doF32FTZ]>; + def f32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[allowFMA]>; + def f32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[allowFMA]>; + + def f16rr_ftz : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math, allowFMA, doF32FTZ]>; + def f16rr : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + 
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math, allowFMA]>; + + def f16x2rr_ftz : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, allowFMA, doF32FTZ]>; + def f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, allowFMA]>; + + // These have strange names so we don't perturb existing mir tests. + def _rnf64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>, + Requires<[noFMA]>; + def _rnf64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; + def _rnf32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA, doF32FTZ]>; + def _rnf32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA, doF32FTZ]>; + def _rnf32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>, + Requires<[noFMA]>; + def _rnf32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>, + Requires<[noFMA]>; + def _rnf16rr_ftz : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math, noFMA, doF32FTZ]>; + def _rnf16rr : + NVPTXInst<(outs Float16Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"), + [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>, + Requires<[useFP16Math, noFMA]>; + def _rnf16x2rr_ftz : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, noFMA, doF32FTZ]>; + def _rnf16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b), + !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"), + [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>, + Requires<[useFP16Math, noFMA]>; +} + +// Template for operations which take two f32 or f64 operands. Provides three +// instructions: .f64, .f32, and .ftz.f32 (flush +// subnormal inputs and results to zero). 
+multiclass F2 { + def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a), + !strconcat(OpcStr, ".f64 \t$dst, $a;"), + [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>; + def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>, + Requires<[doF32FTZ]>; + def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a), + !strconcat(OpcStr, ".f32 \t$dst, $a;"), + [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>; +} + +//===----------------------------------------------------------------------===// +// NVPTX Instructions. +//===----------------------------------------------------------------------===// + +//----------------------------------- +// Type Conversion +//----------------------------------- + +let hasSideEffects = 0 in { + // Generate a cvt to the given type from all possible types. Each instance + // takes a CvtMode immediate that defines the conversion mode to use. It can + // be CvtNONE to omit a conversion mode. + multiclass CVT_FROM_ALL { + def _s8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s8 \t$dst, $src;"), []>; + def _u8 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u8 \t$dst, $src;"), []>; + def _s16 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s16 \t$dst, $src;"), []>; + def _u16 : + NVPTXInst<(outs RC:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u16 \t$dst, $src;"), []>; + def _s32 : + NVPTXInst<(outs RC:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s32 \t$dst, $src;"), []>; + def _u32 : + NVPTXInst<(outs RC:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u32 \t$dst, $src;"), []>; + def _s64 : + NVPTXInst<(outs RC:$dst), + (ins Int64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".s64 \t$dst, $src;"), []>; + def _u64 : + NVPTXInst<(outs RC:$dst), + (ins Int64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".u64 \t$dst, $src;"), []>; + def _f16 : + NVPTXInst<(outs RC:$dst), + (ins Float16Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f16 \t$dst, $src;"), []>; + def _f32 : + NVPTXInst<(outs RC:$dst), + (ins Float32Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f32 \t$dst, $src;"), []>; + def _f64 : + NVPTXInst<(outs RC:$dst), + (ins Float64Regs:$src, CvtMode:$mode), + !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", + FromName, ".f64 \t$dst, $src;"), []>; + } + + // Generate cvts from all types to all types. 
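// Each instance carries a CvtMode immediate; the printer expands it into the
// ".rni/.rzi/..." rounding suffix plus optional ".ftz" and ".sat".  A C++
// sketch of that decoding, assuming the bit layout implied by the Cvt*
// PatLeafs above (low nibble = rounding, 0x10 = FTZ, 0x20 = SAT); the
// authoritative strings live in printCvtMode:

#include <cstdio>
#include <string>

std::string cvtSuffix(unsigned mode) {
  // Only the values defined by the Cvt* PatLeafs are handled here.
  static const char *base[] = {"",    ".rni", ".rzi", ".rmi", ".rpi",
                               ".rn", ".rz",  ".rm",  ".rp"};
  std::string s = base[mode & 0xF];
  if (mode & 0x10) s += ".ftz";
  if (mode & 0x20) s += ".sat";
  return s;
}

int main() {
  // CvtRMI_FTZ == 0x13, the mode used by the frem expansion further down.
  std::printf("cvt%s.f32.f32\n", cvtSuffix(0x13).c_str());
  return 0;
}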
+ defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; + defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; + defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; + defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; + defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; + defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; + defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; + defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; + defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>; + defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; + defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; + + // These cvts are different from those above: The source and dest registers + // are of the same type. + def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.s16.s8 \t$dst, $src;", []>; + def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "cvt.s32.s8 \t$dst, $src;", []>; + def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "cvt.s32.s16 \t$dst, $src;", []>; + def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s8 \t$dst, $src;", []>; + def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s16 \t$dst, $src;", []>; + def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "cvt.s64.s32 \t$dst, $src;", []>; +} + +//----------------------------------- +// Integer Arithmetic +//----------------------------------- + +// Template for xor masquerading as int1 arithmetic. +multiclass ADD_SUB_i1 { + def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + "xor.pred \t$dst, $a, $b;", + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>; +} + +// int1 addition and subtraction are both just xor. +defm ADD_i1 : ADD_SUB_i1; +defm SUB_i1 : ADD_SUB_i1; + +// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we +// also use these for unsigned arithmetic. +defm ADD : I3<"add.s", add>; +defm SUB : I3<"sub.s", sub>; + +// int32 addition and subtraction with carry-out. +// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?). +defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>; +defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>; + +// int32 addition and subtraction with carry-in and carry-out. +defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>; +defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>; + +defm MULT : I3<"mul.lo.s", mul>; + +defm MULTHS : I3<"mul.hi.s", mulhs>; +defm MULTHU : I3<"mul.hi.u", mulhu>; + +defm SDIV : I3<"div.s", sdiv>; +defm UDIV : I3<"div.u", udiv>; + +// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM +// will lower it. +defm SREM : I3<"rem.s", srem>; +defm UREM : I3<"rem.u", urem>; + +// Integer absolute value. NumBits should be one minus the bit width of RC. +// This idiom implements the algorithm at +// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs. +multiclass ABS { + def : NVPTXInst<(outs RC:$dst), (ins RC:$a), + !strconcat("abs", SizeName, " \t$dst, $a;"), + [(set RC:$dst, (abs RC:$a))]>; +} +defm ABS_16 : ABS; +defm ABS_32 : ABS; +defm ABS_64 : ABS; + +// Integer min/max. 
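// The same bit pattern orders differently as s32 and u32, which is why PTX
// (and the multiclass instantiations below) needs separate signed and
// unsigned min/max.  A quick C++ illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t bits = 0xFFFFFFFFu;   // -1 when read as s32, 4294967295 as u32
  int32_t  s = std::max<int32_t>(static_cast<int32_t>(bits), 1);   // 1
  uint32_t u = std::max<uint32_t>(bits, 1u);                       // 0xFFFFFFFF
  std::printf("max.s32 -> %d, max.u32 -> %u\n", s, u);
  return 0;
}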
+defm SMAX : I3<"max.s", smax>; +defm UMAX : I3<"max.u", umax>; +defm SMIN : I3<"min.s", smin>; +defm UMIN : I3<"min.u", umin>; + +// +// Wide multiplication +// +def MULWIDES64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; +def MULWIDES64Imm64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), + "mul.wide.s32 \t$dst, $a, $b;", []>; + +def MULWIDEU64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; +def MULWIDEU64Imm64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b), + "mul.wide.u32 \t$dst, $a, $b;", []>; + +def MULWIDES32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; +def MULWIDES32Imm32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.s16 \t$dst, $a, $b;", []>; + +def MULWIDEU32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; +def MULWIDEU32Imm32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + "mul.wide.u16 \t$dst, $a, $b;", []>; + +def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>; +def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; +def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; + +// Matchers for signed, unsigned mul.wide ISD nodes. +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)), + (MULWIDES32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)), + (MULWIDEU32Imm Int16Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)), + (MULWIDES64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)), + (MULWIDEU64Imm Int32Regs:$a, imm:$b)>, + Requires<[doMulWide]>; + +// Predicates used for converting some patterns to mul.wide. 
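// The checks below ask whether a 64-bit (or 32-bit) immediate still fits in
// the narrower source width of mul.wide.  Roughly what APInt::isSignedIntN(32)
// and isIntN(32) compute, expressed in plain C++ (sketch only):

#include <cstdint>
#include <cstdio>

bool fitsSigned32(int64_t v)    { return v == static_cast<int32_t>(v); }
bool fitsUnsigned32(uint64_t v) { return v == static_cast<uint32_t>(v); }

int main() {
  std::printf("%d %d\n", fitsSigned32(-5) ? 1 : 0,
              fitsSigned32(INT64_C(1) << 40) ? 1 : 0);   // prints "1 0"
  return 0;
}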
+def SInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(32);
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(32);
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(16);
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(16);
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(32);
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(16);
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(32, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(16, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
+}]>;
+
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+
+// Convert "sign/zero-extend then multiply" to mul.wide.
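// What the patterns below rely on: mul.wide.s32 (resp. .u32) produces the
// full 64-bit product of its 32-bit inputs, which is exactly what the IR
// "sign/zero-extend then multiply" idiom computes.  In C++ terms
// (illustrative check only):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t a = -100000, b = 300000;
  int64_t wide = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  std::printf("%lld\n", static_cast<long long>(wide));   // -30000000000
  return 0;
}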
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)), + (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)), + (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)), + (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)), + (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)), + (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)), + (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>, + Requires<[doMulWide]>; + +def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)), + (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, + Requires<[doMulWide]>; +def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), + (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>, + Requires<[doMulWide]>; + +// +// Integer multiply-add +// +def SDTIMAD : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, + SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>; +def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>; + +def MAD16rrr : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>; +def MAD16rri : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>; +def MAD16rir : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>; +def MAD16rii : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int16Regs:$a, i16imm:$b, i16imm:$c), + "mad.lo.s16 \t$dst, $a, $b, $c;", + [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>; + +def MAD32rrr : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>; +def MAD32rri : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>; +def MAD32rir : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>; +def MAD32rii : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$a, i32imm:$b, i32imm:$c), + "mad.lo.s32 \t$dst, $a, $b, $c;", + [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>; + +def MAD64rrr : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>; +def MAD64rri : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>; +def MAD64rir : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, 
Int64Regs:$c))]>; +def MAD64rii : + NVPTXInst<(outs Int64Regs:$dst), + (ins Int64Regs:$a, i64imm:$b, i64imm:$c), + "mad.lo.s64 \t$dst, $a, $b, $c;", + [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>; + +def INEG16 : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "neg.s16 \t$dst, $src;", + [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; +def INEG32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "neg.s32 \t$dst, $src;", + [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>; +def INEG64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "neg.s64 \t$dst, $src;", + [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>; + +//----------------------------------- +// Floating Point Arithmetic +//----------------------------------- + +// Constant 1.0f +def FloatConst1 : PatLeaf<(fpimm), [{ + return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() && + N->getValueAPF().convertToFloat() == 1.0f; +}]>; +// Constant 1.0 (double) +def DoubleConst1 : PatLeaf<(fpimm), [{ + return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() && + N->getValueAPF().convertToDouble() == 1.0; +}]>; + +// Loads FP16 constant into a register. +// +// ptxas does not have hex representation for fp16, so we can't use +// fp16 immediate values in .f16 instructions. Instead we have to load +// the constant into a register using mov.b16. +def LOAD_CONST_F16 : + NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a), + "mov.b16 \t$dst, $a;", []>; + +defm FADD : F3_fma_component<"add", fadd>; +defm FSUB : F3_fma_component<"sub", fsub>; +defm FMUL : F3_fma_component<"mul", fmul>; + +defm FMIN : F3<"min", fminnum>; +defm FMAX : F3<"max", fmaxnum>; + +defm FABS : F2<"abs", fabs>; +defm FNEG : F2<"neg", fneg>; +defm FSQRT : F2<"sqrt.rn", fsqrt>; + +// +// F64 division +// +def FDIV641r : + NVPTXInst<(outs Float64Regs:$dst), + (ins f64imm:$a, Float64Regs:$b), + "rcp.rn.f64 \t$dst, $b;", + [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>; +def FDIV64rr : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, Float64Regs:$b), + "div.rn.f64 \t$dst, $a, $b;", + [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>; +def FDIV64ri : + NVPTXInst<(outs Float64Regs:$dst), + (ins Float64Regs:$a, f64imm:$b), + "div.rn.f64 \t$dst, $a, $b;", + [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>; + +// +// F32 Approximate reciprocal +// +def FDIV321r_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV321r : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX]>; +// +// F32 Approximate division +// +def FDIV32approxrr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.approx.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV32approxri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX, doF32FTZ]>; +def FDIV32approxrr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set 
Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_APPROX]>; +def FDIV32approxri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.approx.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_APPROX]>; +// +// F32 Semi-accurate reciprocal +// +// rcp.approx gives the same result as div.full(1.0f, a) and is faster. +// +def FDIV321r_approx_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV321r_approx : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.approx.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; +// +// F32 Semi-accurate division +// +def FDIV32rr_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32ri_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.full.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL, doF32FTZ]>; +def FDIV32rr : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[do_DIVF32_FULL]>; +def FDIV32ri : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.full.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[do_DIVF32_FULL]>; +// +// F32 Accurate reciprocal +// +def FDIV321r_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.ftz.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20, doF32FTZ]>; +def FDIV321r_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins f32imm:$a, Float32Regs:$b), + "rcp.rn.f32 \t$dst, $b;", + [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; +// +// F32 Accurate division +// +def FDIV32rr_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32ri_prec_ftz : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.ftz.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[doF32FTZ, reqPTX20]>; +def FDIV32rr_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, Float32Regs:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>, + Requires<[reqPTX20]>; +def FDIV32ri_prec : + NVPTXInst<(outs Float32Regs:$dst), + (ins Float32Regs:$a, f32imm:$b), + "div.rn.f32 \t$dst, $a, $b;", + [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>, + Requires<[reqPTX20]>; + +// +// FMA +// + +multiclass FMA { + def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, + Requires<[Pred]>; + def rri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, 
ImmCls:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>, + Requires<[Pred]>; + def rir : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>, + Requires<[Pred]>; + def rii : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, ImmCls:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>, + Requires<[Pred]>; +} + +multiclass FMA_F16 { + def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), + !strconcat(OpcStr, " \t$dst, $a, $b, $c;"), + [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>, + Requires<[useFP16Math, Pred]>; +} + +defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>; +defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>; +defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>; +defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>; +defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>; +defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>; +defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>; + +// sin/cos +def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "sin.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>, + Requires<[allowUnsafeFPMath]>; +def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "cos.approx.f32 \t$dst, $src;", + [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>, + Requires<[allowUnsafeFPMath]>; + +// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)), +// i.e. "poor man's fmod()" + +// frem - f32 FTZ +def : Pat<(frem Float32Regs:$x, Float32Regs:$y), + (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32 + (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ), + Float32Regs:$y))>, + Requires<[doF32FTZ]>; +def : Pat<(frem Float32Regs:$x, fpimm:$y), + (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32 + (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ), + fpimm:$y))>, + Requires<[doF32FTZ]>; + +// frem - f32 +def : Pat<(frem Float32Regs:$x, Float32Regs:$y), + (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32 + (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI), + Float32Regs:$y))>; +def : Pat<(frem Float32Regs:$x, fpimm:$y), + (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32 + (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI), + fpimm:$y))>; + +// frem - f64 +def : Pat<(frem Float64Regs:$x, Float64Regs:$y), + (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64 + (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI), + Float64Regs:$y))>; +def : Pat<(frem Float64Regs:$x, fpimm:$y), + (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64 + (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI), + fpimm:$y))>; + +//----------------------------------- +// Bitwise operations +//----------------------------------- + +// Template for three-arg bitwise operations. Takes three args, Creates .b16, +// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr. 
+multiclass BITWISE { + def b1rr : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>; + def b1ri : + NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), + !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), + [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; + def b16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; + def b16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), + !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>; + def b32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def b32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; + def b64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>; + def b64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), + !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; +} + +defm OR : BITWISE<"or", or>; +defm AND : BITWISE<"and", and>; +defm XOR : BITWISE<"xor", xor>; + +def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), + "not.pred \t$dst, $src;", + [(set Int1Regs:$dst, (not Int1Regs:$src))]>; +def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "not.b16 \t$dst, $src;", + [(set Int16Regs:$dst, (not Int16Regs:$src))]>; +def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), + "not.b32 \t$dst, $src;", + [(set Int32Regs:$dst, (not Int32Regs:$src))]>; +def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), + "not.b64 \t$dst, $src;", + [(set Int64Regs:$dst, (not Int64Regs:$src))]>; + +// Template for left/right shifts. Takes three operands, +// [dest (reg), src (reg), shift (reg or imm)]. +// dest and src may be int64, int32, or int16, but shift is always int32. +// +// This template also defines a 32-bit shift (imm, imm) instruction. 
+multiclass SHIFT { + def i64rr : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>; + def i64ri : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b), + !strconcat(OpcStr, "64 \t$dst, $a, $b;"), + [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>; + def i32rr : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>; + def i32ri : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>; + def i32ii : + NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b), + !strconcat(OpcStr, "32 \t$dst, $a, $b;"), + [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>; + def i16rr : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>; + def i16ri : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b), + !strconcat(OpcStr, "16 \t$dst, $a, $b;"), + [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; +} + +defm SHL : SHIFT<"shl.b", shl>; +defm SRA : SHIFT<"shr.s", sra>; +defm SRL : SHIFT<"shr.u", srl>; + +// Bit-reverse +def BREV32 : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), + "brev.b32 \t$dst, $a;", + [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>; +def BREV64 : + NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a), + "brev.b64 \t$dst, $a;", + [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; + +// +// Rotate: Use ptx shf instruction if available. +// + +// 32 bit r2 = rotl r1, n +// => +// r2 = shf.l r1, r1, n +def ROTL32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTL32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32 bit r2 = rotr r1, n +// => +// r2 = shf.r r1, r1, n +def ROTR32imm_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + +def ROTR32reg_hw : + NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), + "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", + [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>, + Requires<[hasHWROT32]>; + +// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. 
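// In C++ terms, the expansion below computes, for 0 < amt < 32:
//   rotl32(x, amt) == (x << amt) | (x >> (32 - amt))
// The PTX sequence uses add.u32 rather than or, which is equivalent because
// the two shifted values occupy disjoint bit ranges.  A quick reference
// check (sketch, assuming 0 < amt < 32):

#include <cstdint>
#include <cstdio>

uint32_t rotl32(uint32_t x, unsigned amt) {
  return (x << amt) + (x >> (32u - amt));   // '+' mirrors the add.u32 below
}

int main() {
  std::printf("%08x\n", (unsigned)rotl32(0x80000001u, 1));   // prints 00000003
  return 0;
}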
+def ROT32imm_sw :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            "shl.b32 \t%lhs, $src, $amt1;\n\t"
+            "shr.b32 \t%rhs, $src, $amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shl.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shr.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+      Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shr.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shl.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+      Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            "shl.b64 \t%lhs, $src, $amt1;\n\t"
+            "shr.b64 \t%rhs, $src, $amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shl.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shr.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shr.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shl.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def SDTIntShiftDOp : + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; +def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +def FUNSHFLCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; + +def FUNSHFRCLAMP : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>; + +// +// BFE - bit-field extract +// + +// Template for BFE instructions. Takes four args, +// [dest (reg), src (reg), start (reg or imm), end (reg or imm)]. +// Start may be an imm only if end is also an imm. FIXME: Is this a +// restriction in PTX? +// +// dest and src may be int32 or int64, but start and end are always int32. +multiclass BFE { + def rrr + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, Int32Regs:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rri + : NVPTXInst<(outs RC:$d), + (ins RC:$a, Int32Regs:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; + def rii + : NVPTXInst<(outs RC:$d), + (ins RC:$a, i32imm:$b, i32imm:$c), + !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>; +} + +let hasSideEffects = 0 in { + defm BFE_S32 : BFE<"s32", Int32Regs>; + defm BFE_U32 : BFE<"u32", Int32Regs>; + defm BFE_S64 : BFE<"s64", Int64Regs>; + defm BFE_U64 : BFE<"u64", Int64Regs>; +} + +//----------------------------------- +// Comparison instructions (setp, set) +//----------------------------------- + +// FIXME: This doesn't cover versions of set and setp that combine with a +// boolean predicate, e.g. setp.eq.and.b16. 
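// The plain compare modes (CmpLT, CmpEQ, ...) are "ordered": they yield false
// whenever an operand is NaN.  The ...U modes (CmpLTU, CmpEQU, ...) are the
// unordered variants and yield true in that case, while CmpNUM/CmpNAN test
// orderedness itself.  A C++ sketch of the ordered/unordered distinction
// (illustrative only):

#include <cmath>
#include <cstdio>

bool ltOrdered(float a, float b)   { return a < b; }       // false if NaN
bool ltUnordered(float a, float b) { return !(a >= b); }   // true if NaN

int main() {
  float nan = std::nanf("");
  std::printf("%d %d\n", ltOrdered(1.0f, nan) ? 1 : 0,
              ltUnordered(1.0f, nan) ? 1 : 0);             // prints "0 1"
  return 0;
}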
+ +let hasSideEffects = 0 in { + multiclass SETP { + def rr : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + def ri : + NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + def ir : + NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, + " \t$dst, $a, $b;"), []>; + } +} + +defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>; +defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>; +defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>; +defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>; +defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>; +defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>; +defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>; +defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>; +defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>; +defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>; +defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>; +def SETP_f16rr : + NVPTXInst<(outs Int1Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp), + "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;", + []>, Requires<[useFP16Math]>; + +def SETP_f16x2rr : + NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q), + (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp), + "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;", + []>, + Requires<[useFP16Math]>; + + +// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form +// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination +// reg, either u32, s32, or f32. Anyway these aren't used at the moment. + +let hasSideEffects = 0 in { + multiclass SET { + def rr : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + def ri : NVPTXInst<(outs Int32Regs:$dst), + (ins RC:$a, ImmCls:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + def ir : NVPTXInst<(outs Int32Regs:$dst), + (ins ImmCls:$a, RC:$b, CmpMode:$cmp), + !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>; + } +} + +defm SET_b16 : SET<"b16", Int16Regs, i16imm>; +defm SET_s16 : SET<"s16", Int16Regs, i16imm>; +defm SET_u16 : SET<"u16", Int16Regs, i16imm>; +defm SET_b32 : SET<"b32", Int32Regs, i32imm>; +defm SET_s32 : SET<"s32", Int32Regs, i32imm>; +defm SET_u32 : SET<"u32", Int32Regs, i32imm>; +defm SET_b64 : SET<"b64", Int64Regs, i64imm>; +defm SET_s64 : SET<"s64", Int64Regs, i64imm>; +defm SET_u64 : SET<"u64", Int64Regs, i64imm>; +defm SET_f16 : SET<"f16", Float16Regs, f16imm>; +defm SET_f32 : SET<"f32", Float32Regs, f32imm>; +defm SET_f64 : SET<"f64", Float64Regs, f64imm>; + +//----------------------------------- +// Selection instructions (selp) +//----------------------------------- + +// FIXME: Missing slct + +// selp instructions that don't have any pattern matches; we explicitly use +// them within this file. 
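// selp.TYPE d, a, b, p is a predicated select: d = p ? a : b, i.e. the
// ISD::SELECT node that SELP_PATTERN matches below.  As plain C++ (sketch):

#include <cstdint>
#include <cstdio>

int32_t selp(bool p, int32_t a, int32_t b) { return p ? a : b; }

int main() {
  std::printf("%d %d\n", selp(true, 1, 2), selp(false, 1, 2));   // prints "1 2"
  return 0;
}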
+let hasSideEffects = 0 in { + multiclass SELP { + def rr : NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ri : NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ir : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + def ii : NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; + } + + multiclass SELP_PATTERN { + def rr : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>; + def ri : + NVPTXInst<(outs RC:$dst), + (ins RC:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>; + def ir : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, RC:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>; + def ii : + NVPTXInst<(outs RC:$dst), + (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), + !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), + [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>; + } +} + +// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as +// good. +defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>; +defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>; +defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>; +defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>; +defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>; +defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>; +defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>; +defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>; +defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>; +defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>; +defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>; +defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>; + +def SELP_f16x2rr : + NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p), + "selp.b32 \t$dst, $a, $b, $p;", + [(set Float16x2Regs:$dst, + (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>; + +//----------------------------------- +// Data Movement (Load / Store, Move) +//----------------------------------- + +def ADDRri : ComplexPattern; +def ADDRri64 : ComplexPattern; + +def MEMri : Operand { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int32Regs, i32imm); +} +def MEMri64 : Operand { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops Int64Regs, i64imm); +} + +def imem : Operand { + let PrintMethod = "printOperand"; +} + +def imemAny : Operand { + let PrintMethod = "printOperand"; +} + +def LdStCode : Operand { + let PrintMethod = "printLdStCode"; +} + +def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; +def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>; + +// Load a memory address into a u32 or u64 register. +def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a), + "mov.u32 \t$dst, $a;", + [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>; +def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), + "mov.u64 \t$dst, $a;", + [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>; + +// Get pointer to local stack. 
+let hasSideEffects = 0 in { + def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num), + "mov.u32 \t$d, __local_depot$num;", []>; + def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num), + "mov.u64 \t$d, __local_depot$num;", []>; +} + + +// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp +let IsSimpleMove=1, hasSideEffects=0 in { + def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), + "mov.pred \t$dst, $sss;", []>; + def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), + "mov.u16 \t$dst, $sss;", []>; + def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), + "mov.u32 \t$dst, $sss;", []>; + def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss), + "mov.u64 \t$dst, $sss;", []>; + + def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src), + // We have to use .b16 here as there's no mov.f16. + "mov.b16 \t$dst, $src;", []>; + def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src), + "mov.f32 \t$dst, $src;", []>; + def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), + "mov.f64 \t$dst, $src;", []>; +} + +def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), + "mov.pred \t$dst, $src;", + [(set Int1Regs:$dst, imm:$src)]>; +def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), + "mov.u16 \t$dst, $src;", + [(set Int16Regs:$dst, imm:$src)]>; +def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src), + "mov.u32 \t$dst, $src;", + [(set Int32Regs:$dst, imm:$src)]>; +def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src), + "mov.u64 \t$dst, $src;", + [(set Int64Regs:$dst, imm:$src)]>; + +def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), + "mov.f32 \t$dst, $src;", + [(set Float32Regs:$dst, fpimm:$src)]>; +def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), + "mov.f64 \t$dst, $src;", + [(set Float64Regs:$dst, fpimm:$src)]>; + +def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; + +//---- Copy Frame Index ---- +def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), + "add.u32 \t$dst, ${addr:add};", + [(set Int32Regs:$dst, ADDRri:$addr)]>; +def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), + "add.u64 \t$dst, ${addr:add};", + [(set Int64Regs:$dst, ADDRri64:$addr)]>; + +//----------------------------------- +// Comparison and Selection +//----------------------------------- + +multiclass ISET_FORMAT { + // i16 -> pred + def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)), + (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)), + (setp_16ri Int16Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)), + (setp_16ir imm:$a, Int16Regs:$b, Mode)>; + // i32 -> pred + def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)), + (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)), + (setp_32ri Int32Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)), + (setp_32ir imm:$a, Int32Regs:$b, Mode)>; + // i64 -> pred + def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)), + (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)), + (setp_64ri Int64Regs:$a, imm:$b, Mode)>; + def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)), + (setp_64ir imm:$a, Int64Regs:$b, Mode)>; + + // i16 -> i32 + def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)), + (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>; + def : Pat<(i32 
(OpNode Int16Regs:$a, imm:$b)), + (set_16ri Int16Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)), + (set_16ir imm:$a, Int16Regs:$b, Mode)>; + // i32 -> i32 + def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)), + (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)), + (set_32ri Int32Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)), + (set_32ir imm:$a, Int32Regs:$b, Mode)>; + // i64 -> i32 + def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)), + (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>; + def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)), + (set_64ri Int64Regs:$a, imm:$b, Mode)>; + def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)), + (set_64ir imm:$a, Int64Regs:$b, Mode)>; +} + +multiclass ISET_FORMAT_SIGNED + : ISET_FORMAT { + // TableGen doesn't like empty multiclasses. + def : PatLeaf<(i32 0)>; +} + +multiclass ISET_FORMAT_UNSIGNED + : ISET_FORMAT { + // TableGen doesn't like empty multiclasses. + def : PatLeaf<(i32 0)>; +} + +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_SIGNED; +defm : ISET_FORMAT_UNSIGNED; +defm : ISET_FORMAT_UNSIGNED; +defm : ISET_FORMAT_UNSIGNED; +defm : ISET_FORMAT_UNSIGNED; +defm : ISET_FORMAT_UNSIGNED; +defm : ISET_FORMAT_UNSIGNED; + +// i1 compares +def : Pat<(setne Int1Regs:$a, Int1Regs:$b), + (XORb1rr Int1Regs:$a, Int1Regs:$b)>; +def : Pat<(setune Int1Regs:$a, Int1Regs:$b), + (XORb1rr Int1Regs:$a, Int1Regs:$b)>; + +def : Pat<(seteq Int1Regs:$a, Int1Regs:$b), + (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; +def : Pat<(setueq Int1Regs:$a, Int1Regs:$b), + (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>; + +// i1 compare -> i32 +def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), + (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; +def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)), + (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>; + + + +multiclass FSET_FORMAT { + // f16 -> pred + def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), + (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>, + Requires<[useFP16Math,doF32FTZ]>; + def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)), + (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>, + Requires<[useFP16Math]>; + def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), + (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>, + Requires<[useFP16Math,doF32FTZ]>; + def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)), + (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>, + Requires<[useFP16Math]>; + def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), + (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>, + Requires<[useFP16Math,doF32FTZ]>; + def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)), + (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>, + Requires<[useFP16Math]>; + + // f32 -> pred + def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), + (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; + def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)), + (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>; + def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), + (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; + def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), + (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>; + def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)), + (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>, + Requires<[doF32FTZ]>; 
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> pred
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f16 -> i32
+  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+        Requires<[useFP16Math, doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+            (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+        Requires<[useFP16Math]>;
+  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+        Requires<[useFP16Math, doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+            (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+        Requires<[useFP16Math]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+        Requires<[useFP16Math, doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+            (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+        Requires<[useFP16Math]>;
+
+  // f32 -> i32
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> i32
+  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+// FIXME: What is this doing here?  Can it be deleted?
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad, +// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDTDeclareParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def SDTDeclareScalarParamProfile : + SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; +def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; +def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; +def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>; +def SDTCallVoidProfile : SDTypeProfile<0, 1, []>; +def SDTCallValProfile : SDTypeProfile<1, 0, []>; +def SDTMoveParamProfile : SDTypeProfile<1, 1, []>; +def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>; +def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>; +def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>; +def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>; + +def DeclareParam : + SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareScalarParam : + SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRetParam : + SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def DeclareRet : + SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LoadParam : + SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV2 : + SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def LoadParamV4 : + SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, + [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; +def PrintCall : + SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCall : + SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintCallUni : + SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def PrintConvergentCallUni : + SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParam : + SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV2 : + SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamV4 : + SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamU32 : + 
SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def StoreParamS32 : + SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgBegin : + SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArg : + SDNode<"NVPTXISD::CallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def LastCallArg : + SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallArgEnd : + SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVoid : + SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def Prototype : + SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def CallVal : + SDNode<"NVPTXISD::CallVal", SDTCallValProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def MoveParam : + SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; +def StoreRetval : + SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV2 : + SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile, + [SDNPHasChain, SDNPSideEffect]>; +def StoreRetvalV4 : + SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile, + [SDNPHasChain, SDNPSideEffect]>; +def PseudoUseParam : + SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def RETURNNode : + SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile, + [SDNPHasChain, SDNPSideEffect]>; + +let mayLoad = 1 in { + class LoadParamMemInst : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"), + []>; + + class LoadParamV2MemInst : + NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), + !strconcat("ld.param.v2", opstr, + " \t{{$dst, $dst2}}, [retval0+$b];"), []>; + + class LoadParamV4MemInst : + NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, + regclass:$dst4), + (ins i32imm:$b), + !strconcat("ld.param.v4", opstr, + " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), + []>; +} + +class LoadParamRegInst : + NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), + !strconcat("mov", opstr, " \t$dst, retval$b;"), + [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; + +let mayStore = 1 in { + class StoreParamInst : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), + !strconcat("st.param", opstr, " \t[param$a+$b], $val;"), + []>; + + class StoreParamV2Inst : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, + i32imm:$a, i32imm:$b), + !strconcat("st.param.v2", opstr, + " \t[param$a+$b], {{$val, $val2}};"), + []>; + + class StoreParamV4Inst : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a, + i32imm:$b), + !strconcat("st.param.v4", opstr, + " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"), + []>; + + class StoreRetvalInst : + NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), + !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"), + []>; + + class StoreRetvalV2Inst : + NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), + !strconcat("st.param.v2", opstr, + " \t[func_retval0+$a], {{$val, $val2}};"), + []>; + + 
class StoreRetvalV4Inst : + NVPTXInst<(outs), + (ins regclass:$val, regclass:$val2, regclass:$val3, + regclass:$val4, i32imm:$a), + !strconcat("st.param.v4", opstr, + " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), + []>; +} + +let isCall=1 in { + multiclass CALL { + def PrintCallNoRetInst : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; + def PrintCallRetInst1 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; + def PrintCallRetInst2 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; + def PrintCallRetInst3 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; + def PrintCallRetInst4 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), + [(OpNode (i32 4))]>; + def PrintCallRetInst5 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), + [(OpNode (i32 5))]>; + def PrintCallRetInst6 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5), "), + [(OpNode (i32 6))]>; + def PrintCallRetInst7 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6), "), + [(OpNode (i32 7))]>; + def PrintCallRetInst8 : NVPTXInst<(outs), (ins), + !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " + "retval5, retval6, retval7), "), + [(OpNode (i32 8))]>; + } +} + +defm Call : CALL<"call", PrintCall>; +defm CallUni : CALL<"call.uni", PrintCallUni>; + +// Convergent call instructions. These are identical to regular calls, except +// they have the isConvergent bit set. +let isConvergent=1 in { + defm ConvergentCall : CALL<"call", PrintConvergentCall>; + defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; +} + +def LoadParamMemI64 : LoadParamMemInst; +def LoadParamMemI32 : LoadParamMemInst; +def LoadParamMemI16 : LoadParamMemInst; +def LoadParamMemI8 : LoadParamMemInst; +def LoadParamMemV2I64 : LoadParamV2MemInst; +def LoadParamMemV2I32 : LoadParamV2MemInst; +def LoadParamMemV2I16 : LoadParamV2MemInst; +def LoadParamMemV2I8 : LoadParamV2MemInst; +def LoadParamMemV4I32 : LoadParamV4MemInst; +def LoadParamMemV4I16 : LoadParamV4MemInst; +def LoadParamMemV4I8 : LoadParamV4MemInst; +def LoadParamMemF16 : LoadParamMemInst; +def LoadParamMemF16x2 : LoadParamMemInst; +def LoadParamMemF32 : LoadParamMemInst; +def LoadParamMemF64 : LoadParamMemInst; +def LoadParamMemV2F16 : LoadParamV2MemInst; +def LoadParamMemV2F16x2: LoadParamV2MemInst; +def LoadParamMemV2F32 : LoadParamV2MemInst; +def LoadParamMemV2F64 : LoadParamV2MemInst; +def LoadParamMemV4F16 : LoadParamV4MemInst; +def LoadParamMemV4F16x2: LoadParamV4MemInst; +def LoadParamMemV4F32 : LoadParamV4MemInst; + +def StoreParamI64 : StoreParamInst; +def StoreParamI32 : StoreParamInst; + +def StoreParamI16 : StoreParamInst; +def StoreParamI8 : StoreParamInst; +def StoreParamV2I64 : StoreParamV2Inst; +def StoreParamV2I32 : StoreParamV2Inst; +def StoreParamV2I16 : StoreParamV2Inst; +def StoreParamV2I8 : StoreParamV2Inst; + +def StoreParamV4I32 : StoreParamV4Inst; +def StoreParamV4I16 : StoreParamV4Inst; +def StoreParamV4I8 : StoreParamV4Inst; + +def StoreParamF16 : StoreParamInst; +def StoreParamF16x2 : StoreParamInst; +def StoreParamF32 : StoreParamInst; +def StoreParamF64 : StoreParamInst; +def StoreParamV2F16 : StoreParamV2Inst; +def StoreParamV2F16x2 : StoreParamV2Inst; +def 
StoreParamV2F32 : StoreParamV2Inst; +def StoreParamV2F64 : StoreParamV2Inst; +def StoreParamV4F16 : StoreParamV4Inst; +def StoreParamV4F16x2 : StoreParamV4Inst; +def StoreParamV4F32 : StoreParamV4Inst; + +def StoreRetvalI64 : StoreRetvalInst; +def StoreRetvalI32 : StoreRetvalInst; +def StoreRetvalI16 : StoreRetvalInst; +def StoreRetvalI8 : StoreRetvalInst; +def StoreRetvalV2I64 : StoreRetvalV2Inst; +def StoreRetvalV2I32 : StoreRetvalV2Inst; +def StoreRetvalV2I16 : StoreRetvalV2Inst; +def StoreRetvalV2I8 : StoreRetvalV2Inst; +def StoreRetvalV4I32 : StoreRetvalV4Inst; +def StoreRetvalV4I16 : StoreRetvalV4Inst; +def StoreRetvalV4I8 : StoreRetvalV4Inst; + +def StoreRetvalF64 : StoreRetvalInst; +def StoreRetvalF32 : StoreRetvalInst; +def StoreRetvalF16 : StoreRetvalInst; +def StoreRetvalF16x2 : StoreRetvalInst; +def StoreRetvalV2F64 : StoreRetvalV2Inst; +def StoreRetvalV2F32 : StoreRetvalV2Inst; +def StoreRetvalV2F16 : StoreRetvalV2Inst; +def StoreRetvalV2F16x2: StoreRetvalV2Inst; +def StoreRetvalV4F32 : StoreRetvalV4Inst; +def StoreRetvalV4F16 : StoreRetvalV4Inst; +def StoreRetvalV4F16x2: StoreRetvalV4Inst; + +def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; +def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; +def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; +def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; + +class CallArgInst : + NVPTXInst<(outs), (ins regclass:$a), "$a, ", + [(CallArg (i32 0), regclass:$a)]>; + +class LastCallArgInst : + NVPTXInst<(outs), (ins regclass:$a), "$a", + [(LastCallArg (i32 0), regclass:$a)]>; + +def CallArgI64 : CallArgInst; +def CallArgI32 : CallArgInst; +def CallArgI16 : CallArgInst; +def CallArgF64 : CallArgInst; +def CallArgF32 : CallArgInst; + +def LastCallArgI64 : LastCallArgInst; +def LastCallArgI32 : LastCallArgInst; +def LastCallArgI16 : LastCallArgInst; +def LastCallArgF64 : LastCallArgInst; +def LastCallArgF32 : LastCallArgInst; + +def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", + [(CallArg (i32 0), (i32 imm:$a))]>; +def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a", + [(LastCallArg (i32 0), (i32 imm:$a))]>; + +def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", + [(CallArg (i32 1), (i32 imm:$a))]>; +def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", + [(LastCallArg (i32 1), (i32 imm:$a))]>; + +def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", + [(CallVoid (Wrapper tglobaladdr:$addr))]>; +def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", + [(CallVoid Int32Regs:$addr)]>; +def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", + [(CallVoid Int64Regs:$addr)]>; +def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", + [(Prototype (i32 imm:$val))]>; + +def DeclareRetMemInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), + ".param .align $align .b8 retval$num[$size];", + [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetScalarInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".param .b$size retval$num;", + [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; +def DeclareRetRegInst : + NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), + ".reg .b$size retval$num;", + [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; + +def DeclareParamInst : + NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), + ".param .align $align .b8 
param$a[$size];", + [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; +def DeclareScalarParamInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".param .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; +def DeclareScalarRegInst : + NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), + ".reg .b$size param$a;", + [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; + +class MoveParamInst : + NVPTXInst<(outs regclass:$dst), (ins regclass:$src), + !strconcat("mov", asmstr, " \t$dst, $src;"), + [(set regclass:$dst, (MoveParam regclass:$src))]>; + +def MoveParamI64 : MoveParamInst; +def MoveParamI32 : MoveParamInst; +def MoveParamI16 : + NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), + "cvt.u16.u32 \t$dst, $src;", + [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; +def MoveParamF64 : MoveParamInst; +def MoveParamF32 : MoveParamInst; +def MoveParamF16 : MoveParamInst; + +class PseudoUseParamInst : + NVPTXInst<(outs), (ins regclass:$src), + "// Pseudo use of $src", + [(PseudoUseParam regclass:$src)]>; + +def PseudoUseParamI64 : PseudoUseParamInst; +def PseudoUseParamI32 : PseudoUseParamInst; +def PseudoUseParamI16 : PseudoUseParamInst; +def PseudoUseParamF64 : PseudoUseParamInst; +def PseudoUseParamF32 : PseudoUseParamInst; + + +// +// Load / Store Handling +// +multiclass LD { + def _avar : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _areg_64 : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr];", []>; + def _ari : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _ari_64 : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; + def _asi : NVPTXInst< + (outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t$dst, [$addr+$offset];", []>; +} + +let mayLoad=1, hasSideEffects=0 in { + defm LD_i8 : LD; + defm LD_i16 : LD; + defm LD_i32 : LD; + defm LD_i64 : LD; + defm LD_f16 : LD; + defm LD_f16x2 : LD; + defm LD_f32 : LD; + defm LD_f64 : LD; +} + +multiclass ST { + def _avar : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _areg : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, 
LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _areg_64 : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr], $src;", []>; + def _ari : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _ari_64 : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; + def _asi : NVPTXInst< + (outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + " \t[$addr+$offset], $src;", []>; +} + +let mayStore=1, hasSideEffects=0 in { + defm ST_i8 : ST; + defm ST_i16 : ST; + defm ST_i32 : ST; + defm ST_i64 : ST; + defm ST_f16 : ST; + defm ST_f16x2 : ST; + defm ST_f32 : ST; + defm ST_f64 : ST; +} + +// The following is used only in and after vector elementizations. Vector +// elementization happens at the machine instruction level, so the following +// instructions never appear in the DAG. +multiclass LD_VEC { + def _v2_avar : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr];", []>; + def _v2_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v2_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2}}, [$addr+$offset];", []>; + def _v4_avar : NVPTXInst< + (outs 
regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_areg_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + def _v4_ari : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + def _v4_ari_64 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; + def _v4_asi : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>; +} +let mayLoad=1, hasSideEffects=0 in { + defm LDV_i8 : LD_VEC; + defm LDV_i16 : LD_VEC; + defm LDV_i32 : LD_VEC; + defm LDV_i64 : LD_VEC; + defm LDV_f16 : LD_VEC; + defm LDV_f16x2 : LD_VEC; + defm LDV_f32 : LD_VEC; + defm LDV_f64 : LD_VEC; +} + +multiclass ST_VEC { + def _v2_avar : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_areg_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2}};", []>; + def _v2_ari : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_ari_64 : NVPTXInst< + (outs), + 
(ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v2_asi : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, + i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2}};", []>; + def _v4_avar : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_areg_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_ari_64 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; + def _v4_asi : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, imem:$addr, i32imm:$offset), + "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}" + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>; +} + +let mayStore=1, hasSideEffects=0 in { + defm STV_i8 : ST_VEC; + defm STV_i16 : ST_VEC; + defm STV_i32 : ST_VEC; + defm STV_i64 : ST_VEC; + defm STV_f16 : ST_VEC; + defm STV_f16x2 : ST_VEC; + defm STV_f32 : ST_VEC; + defm STV_f64 : ST_VEC; +} + +//---- Conversion ---- + +class F_BITCONVERT : + NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a), + !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")), + [(set regclassOut:$d, (bitconvert regclassIn:$a))]>; + +def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>; +def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>; +def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>; +def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>; +def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; +def BITCONVERT_64_F2I : F_BITCONVERT<"64", 
Float64Regs, Int64Regs>; +def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>; +def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>; + +// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where +// we cannot specify floating-point literals in isel patterns. Therefore, we +// use an integer selp to select either 1 or 0 and then cvt to floating-point. + +// sint -> f16 +def : Pat<(f16 (sint_to_fp Int1Regs:$a)), + (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (sint_to_fp Int16Regs:$a)), + (CVT_f16_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f16 (sint_to_fp Int32Regs:$a)), + (CVT_f16_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f16 (sint_to_fp Int64Regs:$a)), + (CVT_f16_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f16 +def : Pat<(f16 (uint_to_fp Int1Regs:$a)), + (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f16 (uint_to_fp Int16Regs:$a)), + (CVT_f16_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f16 (uint_to_fp Int32Regs:$a)), + (CVT_f16_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f16 (uint_to_fp Int64Regs:$a)), + (CVT_f16_u64 Int64Regs:$a, CvtRN)>; + +// sint -> f32 +def : Pat<(f32 (sint_to_fp Int1Regs:$a)), + (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f32 (sint_to_fp Int16Regs:$a)), + (CVT_f32_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f32 (sint_to_fp Int32Regs:$a)), + (CVT_f32_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f32 (sint_to_fp Int64Regs:$a)), + (CVT_f32_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f32 +def : Pat<(f32 (uint_to_fp Int1Regs:$a)), + (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f32 (uint_to_fp Int16Regs:$a)), + (CVT_f32_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f32 (uint_to_fp Int32Regs:$a)), + (CVT_f32_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f32 (uint_to_fp Int64Regs:$a)), + (CVT_f32_u64 Int64Regs:$a, CvtRN)>; + +// sint -> f64 +def : Pat<(f64 (sint_to_fp Int1Regs:$a)), + (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f64 (sint_to_fp Int16Regs:$a)), + (CVT_f64_s16 Int16Regs:$a, CvtRN)>; +def : Pat<(f64 (sint_to_fp Int32Regs:$a)), + (CVT_f64_s32 Int32Regs:$a, CvtRN)>; +def : Pat<(f64 (sint_to_fp Int64Regs:$a)), + (CVT_f64_s64 Int64Regs:$a, CvtRN)>; + +// uint -> f64 +def : Pat<(f64 (uint_to_fp Int1Regs:$a)), + (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>; +def : Pat<(f64 (uint_to_fp Int16Regs:$a)), + (CVT_f64_u16 Int16Regs:$a, CvtRN)>; +def : Pat<(f64 (uint_to_fp Int32Regs:$a)), + (CVT_f64_u32 Int32Regs:$a, CvtRN)>; +def : Pat<(f64 (uint_to_fp Int64Regs:$a)), + (CVT_f64_u64 Int64Regs:$a, CvtRN)>; + + +// f16 -> sint +def : Pat<(i1 (fp_to_sint Float16Regs:$a)), + (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float16Regs:$a)), + (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_sint Float16Regs:$a)), + (CVT_s16_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float16Regs:$a)), + (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_sint Float16Regs:$a)), + (CVT_s32_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float16Regs:$a)), + (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_sint Float16Regs:$a)), + (CVT_s64_f16 Float16Regs:$a, CvtRZI)>; + +// f16 -> uint +def : Pat<(i1 (fp_to_uint Float16Regs:$a)), + (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float16Regs:$a)), + (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, 
Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_uint Float16Regs:$a)), + (CVT_u16_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float16Regs:$a)), + (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_uint Float16Regs:$a)), + (CVT_u32_f16 Float16Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float16Regs:$a)), + (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_uint Float16Regs:$a)), + (CVT_u64_f16 Float16Regs:$a, CvtRZI)>; + +// f32 -> sint +def : Pat<(i1 (fp_to_sint Float32Regs:$a)), + (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float32Regs:$a)), + (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_sint Float32Regs:$a)), + (CVT_s16_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float32Regs:$a)), + (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_sint Float32Regs:$a)), + (CVT_s32_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float32Regs:$a)), + (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_sint Float32Regs:$a)), + (CVT_s64_f32 Float32Regs:$a, CvtRZI)>; + +// f32 -> uint +def : Pat<(i1 (fp_to_uint Float32Regs:$a)), + (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float32Regs:$a)), + (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i16 (fp_to_uint Float32Regs:$a)), + (CVT_u16_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float32Regs:$a)), + (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i32 (fp_to_uint Float32Regs:$a)), + (CVT_u32_f32 Float32Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float32Regs:$a)), + (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(i64 (fp_to_uint Float32Regs:$a)), + (CVT_u64_f32 Float32Regs:$a, CvtRZI)>; + +// f64 -> sint +def : Pat<(i1 (fp_to_sint Float64Regs:$a)), + (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_sint Float64Regs:$a)), + (CVT_s16_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_sint Float64Regs:$a)), + (CVT_s32_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_sint Float64Regs:$a)), + (CVT_s64_f64 Float64Regs:$a, CvtRZI)>; + +// f64 -> uint +def : Pat<(i1 (fp_to_uint Float64Regs:$a)), + (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>; +def : Pat<(i16 (fp_to_uint Float64Regs:$a)), + (CVT_u16_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i32 (fp_to_uint Float64Regs:$a)), + (CVT_u32_f64 Float64Regs:$a, CvtRZI)>; +def : Pat<(i64 (fp_to_uint Float64Regs:$a)), + (CVT_u64_f64 Float64Regs:$a, CvtRZI)>; + +// sext i1 +def : Pat<(i16 (sext Int1Regs:$a)), + (SELP_s16ii -1, 0, Int1Regs:$a)>; +def : Pat<(i32 (sext Int1Regs:$a)), + (SELP_s32ii -1, 0, Int1Regs:$a)>; +def : Pat<(i64 (sext Int1Regs:$a)), + (SELP_s64ii -1, 0, Int1Regs:$a)>; + +// zext i1 +def : Pat<(i16 (zext Int1Regs:$a)), + (SELP_u16ii 1, 0, Int1Regs:$a)>; +def : Pat<(i32 (zext Int1Regs:$a)), + (SELP_u32ii 1, 0, Int1Regs:$a)>; +def : Pat<(i64 (zext Int1Regs:$a)), + (SELP_u64ii 1, 0, Int1Regs:$a)>; + +// anyext i1 +def : Pat<(i16 (anyext Int1Regs:$a)), + (SELP_u16ii -1, 0, Int1Regs:$a)>; +def : Pat<(i32 (anyext Int1Regs:$a)), + (SELP_u32ii -1, 0, Int1Regs:$a)>; +def : Pat<(i64 (anyext Int1Regs:$a)), + (SELP_u64ii -1, 0, Int1Regs:$a)>; + +// sext i16 +def : Pat<(i32 (sext Int16Regs:$a)), + (CVT_s32_s16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (sext 
Int16Regs:$a)), + (CVT_s64_s16 Int16Regs:$a, CvtNONE)>; + +// zext i16 +def : Pat<(i32 (zext Int16Regs:$a)), + (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (zext Int16Regs:$a)), + (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; + +// anyext i16 +def : Pat<(i32 (anyext Int16Regs:$a)), + (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +def : Pat<(i64 (anyext Int16Regs:$a)), + (CVT_u64_u16 Int16Regs:$a, CvtNONE)>; + +// sext i32 +def : Pat<(i64 (sext Int32Regs:$a)), + (CVT_s64_s32 Int32Regs:$a, CvtNONE)>; + +// zext i32 +def : Pat<(i64 (zext Int32Regs:$a)), + (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; + +// anyext i32 +def : Pat<(i64 (anyext Int32Regs:$a)), + (CVT_u64_u32 Int32Regs:$a, CvtNONE)>; + + +// truncate i64 +def : Pat<(i32 (trunc Int64Regs:$a)), + (CVT_u32_u64 Int64Regs:$a, CvtNONE)>; +def : Pat<(i16 (trunc Int64Regs:$a)), + (CVT_u16_u64 Int64Regs:$a, CvtNONE)>; +def : Pat<(i1 (trunc Int64Regs:$a)), + (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>; + +// truncate i32 +def : Pat<(i16 (trunc Int32Regs:$a)), + (CVT_u16_u32 Int32Regs:$a, CvtNONE)>; +def : Pat<(i1 (trunc Int32Regs:$a)), + (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>; + +// truncate i16 +def : Pat<(i1 (trunc Int16Regs:$a)), + (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>; + +// sext_inreg +def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>; +def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>; +def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>; +def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>; + + +// Select instructions with 32-bit predicates +def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), + (SELP_b16rr Int16Regs:$a, Int16Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b), + (SELP_b32rr Int32Regs:$a, Int32Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b), + (SELP_b64rr Int64Regs:$a, Int64Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b), + (SELP_f16rr Float16Regs:$a, Float16Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b), + (SELP_f32rr Float32Regs:$a, Float32Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; +def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b), + (SELP_f64rr Float64Regs:$a, Float64Regs:$b, + (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>; + + +let hasSideEffects = 0 in { + // pack a set of smaller int registers to a larger int register + def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2, + Int16Regs:$s3, Int16Regs:$s4), + "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; + def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), + (ins Int16Regs:$s1, Int16Regs:$s2), + "mov.b32 \t$d, {{$s1, $s2}};", []>; + def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), + (ins Int32Regs:$s1, Int32Regs:$s2), + "mov.b64 \t$d, {{$s1, $s2}};", []>; + def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), + (ins Float32Regs:$s1, Float32Regs:$s2), + "mov.b64 \t$d, {{$s1, $s2}};", []>; + + // unpack a larger int register to a set of smaller int registers + def I64toV4I16 : 
NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, + Int16Regs:$d3, Int16Regs:$d4), + (ins Int64Regs:$s), + "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; + def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), + (ins Int32Regs:$s), + "mov.b32 \t{{$d1, $d2}}, $s;", []>; + def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), + (ins Int64Regs:$s), + "mov.b64 \t{{$d1, $d2}}, $s;", []>; + def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), + (ins Float64Regs:$s), + "mov.b64 \t{{$d1, $d2}}, $s;", []>; + +} + +let hasSideEffects = 0 in { + // Extract element of f16x2 register. PTX does not provide any way + // to access elements of f16x2 vector directly, so we need to + // extract it using a temporary register. + def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_hi;\n\t" + " mov.b32 \t{$dst, %tmp_hi}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 0))]>; + def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst), + (ins Float16x2Regs:$src), + "{{ .reg .b16 \t%tmp_lo;\n\t" + " mov.b32 \t{%tmp_lo, $dst}, $src; }}", + [(set Float16Regs:$dst, + (extractelt (v2f16 Float16x2Regs:$src), 1))]>; + + // Coalesce two f16 registers into f16x2 + def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst), + (ins Float16Regs:$a, Float16Regs:$b), + "mov.b32 \t$dst, {{$a, $b}};", + [(set Float16x2Regs:$dst, + (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>; + + // Directly initializing underlying the b32 register is one less SASS + // instruction than than vector-packing move. + def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src), + "mov.b32 \t$dst, $src;", + []>; + + // Split f16x2 into two f16 registers. + def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Float16x2Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; + // Split an i32 into two f16 + def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi), + (ins Int32Regs:$src), + "mov.b32 \t{{$lo, $hi}}, $src;", + []>; +} + +// Count leading zeros +let hasSideEffects = 0 in { + def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "clz.b32 \t$d, $a;", []>; + def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "clz.b64 \t$d, $a;", []>; +} + +// 32-bit has a direct PTX instruction +def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; + +// The return type of the ctlz ISD node is the same as its input, but the PTX +// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the +// ptx value to 64 bits to match the ISD node's semantics, unless we know we're +// truncating back down to 32 bits. +def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; + +// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the +// result back to 16-bits if necessary. We also need to subtract 16 because +// the high-order 16 zeros were counted. +// +// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could +// use to save one SASS instruction (on sm_35 anyway): +// +// mov.b32 $tmp, {0xffff, $a} +// ctlz.b32 $result, $tmp +// +// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" +// and then ctlz that value. This way we don't have to subtract 16 from the +// result. Unfortunately today we don't have a way to generate +// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 
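+// As a hedged illustration of the patterns below (register names invented),
+// a 16-bit ctlz is expected to select to roughly:
+//   cvt.u32.u16  %r1, %rs_a;        // zero-extend the i16 input
+//   clz.b32      %r2, %r1;          // count leading zeros in 32 bits
+//   cvt.u16.u32  %rs_t, %r2;        // truncate the count back to i16
+//   sub.s16      %rs_d, %rs_t, 16;  // drop the 16 high-order zeros we counted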
+def : Pat<(ctlz Int16Regs:$a), + (SUBi16ri (CVT_u16_u32 + (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; +def : Pat<(i32 (zext (ctlz Int16Regs:$a))), + (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; + +// Population count +let hasSideEffects = 0 in { + def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), + "popc.b32 \t$d, $a;", []>; + def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "popc.b64 \t$d, $a;", []>; +} + +// 32-bit has a direct PTX instruction +def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; + +// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit +// to match the LLVM semantics. Just as with ctlz.i64, we provide a second +// pattern that avoids the type conversion if we're truncating the result to +// i32 anyway. +def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; + +// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. +// If we know that we're storing into an i32, we can avoid the final trunc. +def : Pat<(ctpop Int16Regs:$a), + (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; +def : Pat<(i32 (zext (ctpop Int16Regs:$a))), + (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; + +// fpround f32 -> f16 +def : Pat<(f16 (fpround Float32Regs:$a)), + (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fpround Float32Regs:$a)), + (CVT_f16_f32 Float32Regs:$a, CvtRN)>; + +// fpround f64 -> f16 +def : Pat<(f16 (fpround Float64Regs:$a)), + (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fpround Float64Regs:$a)), + (CVT_f16_f64 Float64Regs:$a, CvtRN)>; + +// fpround f64 -> f32 +def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpround Float64Regs:$a)), + (CVT_f32_f64 Float64Regs:$a, CvtRN)>; + +// fpextend f16 -> f32 +def : Pat<(f32 (fpextend Float16Regs:$a)), + (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend Float16Regs:$a)), + (CVT_f32_f16 Float16Regs:$a, CvtNONE)>; + +// fpextend f16 -> f64 +def : Pat<(f64 (fpextend Float16Regs:$a)), + (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f64 (fpextend Float16Regs:$a)), + (CVT_f64_f16 Float16Regs:$a, CvtNONE)>; + +// fpextend f32 -> f64 +def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f64 (fpextend Float32Regs:$a)), + (CVT_f64_f32 Float32Regs:$a, CvtNONE)>; + +def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +// fceil, ffloor, fround, ftrunc. 
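+// Each of these maps onto a same-width cvt with an integer-rounding modifier
+// (rpi: toward +inf, rmi: toward -inf, rni: to nearest even, rzi: toward
+// zero). For example, fceil of an f32 value is expected to emit roughly
+//   cvt.rpi.f32.f32  %f_d, %f_a;
+// (or cvt.rpi.ftz.f32.f32 when f32 subnormals are flushed to zero).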
+ +def : Pat<(fceil Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fceil Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fceil Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fceil Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fceil Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRPI)>; + +def : Pat<(ffloor Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ffloor Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ffloor Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ffloor Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ffloor Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRMI)>; + +def : Pat<(fround Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f16 (fround Float16Regs:$a)), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fround Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fround Float32Regs:$a)), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(f64 (fround Float64Regs:$a)), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + +def : Pat<(ftrunc Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ftrunc Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ftrunc Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(ftrunc Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>; +def : Pat<(ftrunc Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRZI)>; + +// nearbyint and rint are implemented as rounding to nearest even. This isn't +// strictly correct, because it causes us to ignore the rounding mode. But it +// matches what CUDA's "libm" does. 
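+// In other words, fnearbyint and frint below reuse the same cvt.rni form as
+// fround above, e.g. for f64 (illustrative):
+//   cvt.rni.f64.f64  %fd_d, %fd_a;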
+ +def : Pat<(fnearbyint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fnearbyint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fnearbyint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(fnearbyint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(fnearbyint Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + +def : Pat<(frint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(frint Float16Regs:$a), + (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(frint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(frint Float32Regs:$a), + (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>; +def : Pat<(frint Float64Regs:$a), + (CVT_f64_f64 Float64Regs:$a, CvtRNI)>; + + +//----------------------------------- +// Control-flow +//----------------------------------- + +let isTerminator=1 in { + let isReturn=1, isBarrier=1 in + def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>; + + let isBranch=1 in + def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), + "@$a bra \t$target;", + [(brcond Int1Regs:$a, bb:$target)]>; + let isBranch=1 in + def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target), + "@!$a bra \t$target;", []>; + + let isBranch=1, isBarrier=1 in + def GOTO : NVPTXInst<(outs), (ins brtarget:$target), + "bra.uni \t$target;", [(br bb:$target)]>; +} + +def : Pat<(brcond Int32Regs:$a, bb:$target), + (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>; + +// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a +// conditional branch if the target block is the next block so that the code +// can fall through to the target block. The invertion is done by 'xor +// condition, 1', which will be translated to (setne condition, -1). Since ptx +// supports '@!pred bra target', we should use it. +def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), + (CBranchOther Int1Regs:$a, bb:$target)>; + +// Call +def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; + +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPSideEffect]>; + +def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def calltarget : Operand; +let isCall=1 in { + def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>; +} + +def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>; +def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>; + +// Pseudo instructions. 
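+// The pseudo instructions below bracket call lowering. Taken together with
+// the param and call nodes above, a lowered call is expected to print along
+// these lines (illustrative only; symbol and register names invented):
+//   { // callseq 0
+//   .reg .b32 temp_param_reg;
+//   .param .b32 param0;
+//   st.param.b32  [param0+0], %r1;
+//   .param .b32 retval0;
+//   call.uni (retval0), foo, (param0);
+//   ld.param.b32  %r2, [retval0+0];
+//   } // callseq 0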
+class Pseudo pattern> + : NVPTXInst; + +def Callseq_Start : + NVPTXInst<(outs), (ins i32imm:$amt), + "\\{ // callseq $amt\n" + "\t.reg .b32 temp_param_reg;", + [(callseq_start timm:$amt)]>; +def Callseq_End : + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\} // callseq $amt1", + [(callseq_end timm:$amt1, timm:$amt2)]>; + +// trap instruction +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; + +// Call prototype wrapper +def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def CallPrototype : + SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def ProtoIdent : Operand { + let PrintMethod = "printProtoIdent"; +} +def CALL_PROTOTYPE : + NVPTXInst<(outs), (ins ProtoIdent:$ident), + "$ident", [(CallPrototype (i32 texternalsym:$ident))]>; + + +include "NVPTXIntrinsics.td" + + +//----------------------------------- +// Notes +//----------------------------------- +// BSWAP is currently expanded. The following is a more efficient +// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register +// - for sm_20, use pmpt (use vector scalar mov to get the pack and +// unpack). sm_20 supports native 32-bit register, but not native 16-bit +// register.
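+// For instance (illustrative only, not what the backend emits today), a
+// 32-bit byte swap on sm_20+ could be a single permute:
+//   prmt.b32  %r_d, %r_a, 0, 0x0123;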