Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -1128,6 +1128,11 @@
   /// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
   SDValue expandVACopy(SDNode *Node);
 
+  /// Returns a GlobalAddress of the function from the current module with
+  /// name matching the given ExternalSymbol.
+  /// Reports a fatal error if the function doesn't exist.
+  SDValue getSymbolFunctionGlobalAddress(SDValue Op);
+
   /// *Mutate* the specified node in-place to have the
   /// specified operands.  If the resultant node already exists in the DAG,
   /// this does not modify the specified node, instead it returns the node that
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8409,6 +8409,28 @@
   return TokenFactor;
 }
 
+SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op) {
+  assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");
+
+  // Look the symbol name up among the functions of the module that contains
+  // the function currently being compiled.
+  const char *SymbolName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  auto *Callee = MF->getFunction().getParent()->getFunction(SymbolName);
+
+  // Nothing by that name in the module: format the diagnostic and abort.
+  if (Callee == nullptr) {
+    std::string Message;
+    raw_string_ostream OS(Message);
+    OS << "Undefined external symbol " << '"' << SymbolName << '"';
+    OS.flush();
+    report_fatal_error(Message);
+  }
+
+  // The pointer type follows the address space the function lives in.
+  auto PtrVT = TLI->getPointerTy(getDataLayout(), Callee->getAddressSpace());
+  return getGlobalAddress(Callee, SDLoc(Op), PtrVT);
+}
+
 //===----------------------------------------------------------------------===//
 //                              SDNode Class
 //===----------------------------------------------------------------------===//
Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -32,6 +32,7 @@
   NVPTXUtilities.cpp
   NVVMIntrRange.cpp
   NVVMReflect.cpp
+  NVPTXProxyRegErasure.cpp
   )
 
 add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -53,6 +53,7 @@
 FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
 MachineFunctionPass *createNVPTXPeephole();
+MachineFunctionPass *createNVPTXProxyRegErasurePass();
 
 Target &getTheNVPTXTarget32();
 Target &getTheNVPTXTarget64();
Index: lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.h
+++ lib/Target/NVPTX/NVPTXISelLowering.h
@@ -51,6 +51,7 @@
   CallSeqBegin,
   CallSeqEnd,
   CallPrototype,
+  ProxyReg,
   FUN_SHFL_CLAMP,
   FUN_SHFR_CLAMP,
   MUL_WIDE_SIGNED,
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -663,6 +663,8 @@
     return "NVPTXISD::CallSeqEnd";
   case NVPTXISD::CallPrototype:
     return "NVPTXISD::CallPrototype";
+  case NVPTXISD::ProxyReg:
+    return "NVPTXISD::ProxyReg";
   case NVPTXISD::LoadV2:
     return "NVPTXISD::LoadV2";
   case NVPTXISD::LoadV4:
@@ -1666,6 +1668,11 @@
   // indirect calls but is always null for libcalls.
   bool isIndirectCall = !Func && CS;
 
+  if (isa<ExternalSymbolSDNode>(Callee)) {
+    // Try to find the callee in the current module.
+    Callee = DAG.getSymbolFunctionGlobalAddress(Callee);
+  }
+
   if (isIndirectCall) {
     // This is indirect function call case : PTX requires a prototype of the
     // form
@@ -1738,6 +1745,9 @@
     InFlag = Chain.getValue(1);
   }
 
+  SmallVector<SDValue, 16> ProxyRegOps;
+  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
+
   // Generate loads from param memory/moves from registers for result
   if (Ins.size() > 0) {
     SmallVector<EVT, 16> VTs;
@@ -1808,11 +1818,14 @@
             MachineMemOperand::MOLoad);
 
         for (unsigned j = 0; j < NumElts; ++j) {
-          SDValue Ret = RetVal.getValue(j);
+          ProxyRegOps.push_back(RetVal.getValue(j));
+
           if (needTruncate)
-            Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
-          InVals.push_back(Ret);
+            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
+          else
+            ProxyRegTruncates.push_back(Optional<MVT>());
         }
+
         Chain = RetVal.getValue(NumElts);
         InFlag = RetVal.getValue(NumElts + 1);
 
@@ -1828,8 +1841,29 @@
                              DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
                                                    true),
                              InFlag, dl);
+  InFlag = Chain.getValue(1);
   uniqueCallSite++;
 
+  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
+  // will not get lost. Otherwise, during libcall expansion, the nodes can
+  // become dangling.
+  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
+    SDValue Ret = DAG.getNode(
+      NVPTXISD::ProxyReg, dl,
+      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
+      { Chain, ProxyRegOps[i], InFlag }
+    );
+
+    Chain = Ret.getValue(1);
+    InFlag = Ret.getValue(2);
+
+    if (ProxyRegTruncates[i].hasValue()) {
+      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
+    }
+
+    InVals.push_back(Ret);
+  }
+
   // set isTailCall to false for now, until we figure out how to express
   // tail call optimization in PTX
   isTailCall = false;
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1885,6 +1885,7 @@
 def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
 def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
 def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
 
 def DeclareParam :
   SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
@@ -1972,6 +1973,9 @@
 def RETURNNode :
   SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
          [SDNPHasChain, SDNPSideEffect]>;
+def ProxyReg :
+  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 
 let mayLoad = 1 in {
   class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
@@ -2249,6 +2253,21 @@
 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
 
+class ProxyRegInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+            "// Proxy Register pseudo instruction",
+            [(set regclass:$dst, (ProxyReg regclass:$src))]>;
+
+let isCodeGenOnly=1, isPseudo=1 in {
+  def ProxyRegI1    : ProxyRegInst<Int1Regs>;
+  def ProxyRegI16   : ProxyRegInst<Int16Regs>;
+  def ProxyRegI32   : ProxyRegInst<Int32Regs>;
+  def ProxyRegI64   : ProxyRegInst<Int64Regs>;
+  def ProxyRegF16   : ProxyRegInst<Float16Regs>;
+  def ProxyRegF32   : ProxyRegInst<Float32Regs>;
+  def ProxyRegF64   : ProxyRegInst<Float64Regs>;
+  def ProxyRegF16x2 : ProxyRegInst<Float16x2Regs>;
+}
 
 //
 // Load / Store Handling
Index: lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
@@ -0,0 +1,114 @@
+//===- NVPTXProxyRegErasure.cpp - NVPTX Proxy Register Instruction Erasure -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The pass is needed to remove ProxyReg instructions and restore related
+// registers. The instructions were needed at instruction selection stage to
+// make sure that callseq_end nodes won't be removed as "dead nodes". This can
+// happen when we expand instructions into libcalls and the call site doesn't
+// care about the libcall chain. Call site cares about data flow only, and the
+// latest data flow node happens to be before callseq_end. Therefore the node
+// becomes dangling and "dead". The ProxyReg acts like an additional data flow
+// node *after* the callseq_end in the chain and ensures that everything will be
+// preserved.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Machine pass that forwards ProxyReg results to their sources and erases
+/// the ProxyReg pseudo instructions.
+struct NVPTXProxyRegErasure : public MachineFunctionPass {
+  static char ID;
+  NVPTXProxyRegErasure() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "NVPTX Proxy Register Instruction Erasure";
+  }
+
+  /// Returns true when at least one ProxyReg instruction was erased.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  /// Rewrites the users of \p MI's result to read \p MI's source instead.
+  void replaceMachineInstructionUsage(MachineFunction &MF, MachineInstr &MI);
+
+  /// Retargets the operands of \p Instr that name \p From's register.
+  void replaceRegisterUsage(MachineInstr &Instr, MachineOperand &From,
+                            MachineOperand &To);
+};
+
+} // namespace
+
+char NVPTXProxyRegErasure::ID = 0;
+
+bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) {
+  // Collect first, erase afterwards: no block is edited while it is iterated.
+  SmallVector<MachineInstr *, 16> ToErase;
+  for (auto &Block : MF) {
+    for (auto &Instr : Block) {
+      switch (Instr.getOpcode()) {
+      default:
+        continue;
+      case NVPTX::ProxyRegI1:
+      case NVPTX::ProxyRegI16:
+      case NVPTX::ProxyRegI32:
+      case NVPTX::ProxyRegI64:
+      case NVPTX::ProxyRegF16:
+      case NVPTX::ProxyRegF16x2:
+      case NVPTX::ProxyRegF32:
+      case NVPTX::ProxyRegF64:
+        break;
+      }
+      replaceMachineInstructionUsage(MF, Instr);
+      ToErase.push_back(&Instr);
+    }
+  }
+
+  for (auto *Instr : ToErase)
+    Instr->eraseFromParent();
+  return !ToErase.empty();
+}
+
+void NVPTXProxyRegErasure::replaceMachineInstructionUsage(MachineFunction &MF,
+                                                          MachineInstr &MI) {
+  auto &InOp = *MI.uses().begin();
+  auto &OutOp = *MI.defs().begin();
+  assert(InOp.isReg() && "ProxyReg input operand should be a register.");
+  assert(OutOp.isReg() && "ProxyReg output operand should be a register.");
+  // Walk the result register's use list instead of every instruction in MF;
+  // snapshot the users first, as rewriting an operand edits that list.
+  SmallVector<MachineInstr *, 8> Users;
+  for (auto &User : MF.getRegInfo().use_instructions(OutOp.getReg()))
+    Users.push_back(&User);
+  for (auto *User : Users)
+    replaceRegisterUsage(*User, OutOp, InOp);
+}
+
+void NVPTXProxyRegErasure::replaceRegisterUsage(MachineInstr &Instr,
+                                                MachineOperand &From,
+                                                MachineOperand &To) {
+  // Only register operands can match; each operand naming From's register is
+  // switched over to To's register.
+  for (auto &Operand : Instr.uses())
+    if (Operand.isReg() && Operand.getReg() == From.getReg())
+      Operand.setReg(To.getReg());
+}
+
+MachineFunctionPass *llvm::createNVPTXProxyRegErasurePass() {
+  return new NVPTXProxyRegErasure();
+}
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -160,6 +160,7 @@
 
   void addIRPasses() override;
   bool addInstSelector() override;
+  void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
 
@@ -301,6 +302,11 @@
   return false;
 }
 
+void NVPTXPassConfig::addPreRegAlloc() {
+  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
+  addPass(createNVPTXProxyRegErasurePass());
+}
+
 void NVPTXPassConfig::addPostRegAlloc() {
   addPass(createNVPTXPrologEpilogPass(), false);
   if (getOptLevel() != CodeGenOpt::None) {
Index: libcalls.patch
===================================================================
--- /dev/null
+++ libcalls.patch
@@ -0,0 +1,20322 @@
+commit 3dc3170ea13f4229731f50680c14d593e32ca7c6
+Author: Denys Zariaiev <denys.zariaiev@gmail.com>
+Date:   Mon Dec 17 00:03:05 2018 +0100
+
+    NVPTX: libcalls support
+
+diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
+index 8093fdab549..2346c1c3433 100644
+--- a/include/llvm/CodeGen/SelectionDAG.h
++++ b/include/llvm/CodeGen/SelectionDAG.h
+@@ -1,1693 +1,1698 @@
+ //===- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ----------*- C++ -*-===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file declares the SelectionDAG class, and transitively defines the
+ // SDNode class and subclasses.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #ifndef LLVM_CODEGEN_SELECTIONDAG_H
+ #define LLVM_CODEGEN_SELECTIONDAG_H
+ 
+ #include "llvm/ADT/APFloat.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/ADT/ArrayRef.h"
+ #include "llvm/ADT/DenseMap.h"
+ #include "llvm/ADT/DenseSet.h"
+ #include "llvm/ADT/FoldingSet.h"
+ #include "llvm/ADT/SetVector.h"
+ #include "llvm/ADT/SmallVector.h"
+ #include "llvm/ADT/StringMap.h"
+ #include "llvm/ADT/ilist.h"
+ #include "llvm/ADT/iterator.h"
+ #include "llvm/ADT/iterator_range.h"
+ #include "llvm/Analysis/AliasAnalysis.h"
+ #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+ #include "llvm/CodeGen/DAGCombine.h"
+ #include "llvm/CodeGen/FunctionLoweringInfo.h"
+ #include "llvm/CodeGen/ISDOpcodes.h"
+ #include "llvm/CodeGen/MachineFunction.h"
+ #include "llvm/CodeGen/MachineMemOperand.h"
+ #include "llvm/CodeGen/SelectionDAGNodes.h"
+ #include "llvm/CodeGen/ValueTypes.h"
+ #include "llvm/IR/DebugLoc.h"
+ #include "llvm/IR/Instructions.h"
+ #include "llvm/IR/Metadata.h"
+ #include "llvm/Support/Allocator.h"
+ #include "llvm/Support/ArrayRecycler.h"
+ #include "llvm/Support/AtomicOrdering.h"
+ #include "llvm/Support/Casting.h"
+ #include "llvm/Support/CodeGen.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/MachineValueType.h"
+ #include "llvm/Support/RecyclingAllocator.h"
+ #include <algorithm>
+ #include <cassert>
+ #include <cstdint>
+ #include <functional>
+ #include <map>
+ #include <string>
+ #include <tuple>
+ #include <utility>
+ #include <vector>
+ 
+ namespace llvm {
+ 
+ class BlockAddress;
+ class Constant;
+ class ConstantFP;
+ class ConstantInt;
+ class DataLayout;
+ struct fltSemantics;
+ class GlobalValue;
+ struct KnownBits;
+ class LLVMContext;
+ class MachineBasicBlock;
+ class MachineConstantPoolValue;
+ class MCSymbol;
+ class OptimizationRemarkEmitter;
+ class SDDbgValue;
+ class SDDbgLabel;
+ class SelectionDAG;
+ class SelectionDAGTargetInfo;
+ class TargetLibraryInfo;
+ class TargetLowering;
+ class TargetMachine;
+ class TargetSubtargetInfo;
+ class Value;
+ 
+ class SDVTListNode : public FoldingSetNode {
+   friend struct FoldingSetTrait<SDVTListNode>;
+ 
+   /// A reference to an Interned FoldingSetNodeID for this node.
+   /// The Allocator in SelectionDAG holds the data.
+   /// SDVTList contains all types which are frequently accessed in SelectionDAG.
+   /// The size of this list is not expected to be big so it won't introduce
+   /// a memory penalty.
+   FoldingSetNodeIDRef FastID;
+   const EVT *VTs;
+   unsigned int NumVTs;
+   /// The hash value for SDVTList is fixed, so cache it to avoid
+   /// hash calculation.
+   unsigned HashValue;
+ 
+ public:
+   SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num) :
+       FastID(ID), VTs(VT), NumVTs(Num) {
+     HashValue = ID.ComputeHash();
+   }
+ 
+   SDVTList getSDVTList() {
+     SDVTList result = {VTs, NumVTs};
+     return result;
+   }
+ };
+ 
+ /// Specialize FoldingSetTrait for SDVTListNode
+ /// to avoid computing temp FoldingSetNodeID and hash value.
+ template<> struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTListNode> {
+   static void Profile(const SDVTListNode &X, FoldingSetNodeID& ID) {
+     ID = X.FastID;
+   }
+ 
+   static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID,
+                      unsigned IDHash, FoldingSetNodeID &TempID) {
+     if (X.HashValue != IDHash)
+       return false;
+     return ID == X.FastID;
+   }
+ 
+   static unsigned ComputeHash(const SDVTListNode &X, FoldingSetNodeID &TempID) {
+     return X.HashValue;
+   }
+ };
+ 
+ template <> struct ilist_alloc_traits<SDNode> {
+   static void deleteNode(SDNode *) {
+     llvm_unreachable("ilist_traits<SDNode> shouldn't see a deleteNode call!");
+   }
+ };
+ 
+ /// Keeps track of dbg_value information through SDISel.  We do
+ /// not build SDNodes for these so as not to perturb the generated code;
+ /// instead the info is kept off to the side in this structure. Each SDNode may
+ /// have one or more associated dbg_value entries. This information is kept in
+ /// DbgValMap.
+ /// Byval parameters are handled separately because they don't use alloca's,
+ /// which busts the normal mechanism.  There is good reason for handling all
+ /// parameters separately:  they may not have code generated for them, they
+ /// should always go at the beginning of the function regardless of other code
+ /// motion, and debug info for them is potentially useful even if the parameter
+ /// is unused.  Right now only byval parameters are handled separately.
+ class SDDbgInfo {
+   BumpPtrAllocator Alloc;
+   SmallVector<SDDbgValue*, 32> DbgValues;
+   SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
+   SmallVector<SDDbgLabel*, 4> DbgLabels;
+   using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
+   DbgValMapType DbgValMap;
+ 
+ public:
+   SDDbgInfo() = default;
+   SDDbgInfo(const SDDbgInfo &) = delete;
+   SDDbgInfo &operator=(const SDDbgInfo &) = delete;
+ 
+   void add(SDDbgValue *V, const SDNode *Node, bool isParameter) {
+     if (isParameter) {
+       ByvalParmDbgValues.push_back(V);
+     } else     DbgValues.push_back(V);
+     if (Node)
+       DbgValMap[Node].push_back(V);
+   }
+ 
+   void add(SDDbgLabel *L) {
+     DbgLabels.push_back(L);
+   }
+ 
+   /// Invalidate all DbgValues attached to the node and remove
+   /// it from the Node-to-DbgValues map.
+   void erase(const SDNode *Node);
+ 
+   void clear() {
+     DbgValMap.clear();
+     DbgValues.clear();
+     ByvalParmDbgValues.clear();
+     DbgLabels.clear();
+     Alloc.Reset();
+   }
+ 
+   BumpPtrAllocator &getAlloc() { return Alloc; }
+ 
+   bool empty() const {
+     return DbgValues.empty() && ByvalParmDbgValues.empty() && DbgLabels.empty();
+   }
+ 
+   ArrayRef<SDDbgValue*> getSDDbgValues(const SDNode *Node) const {
+     auto I = DbgValMap.find(Node);
+     if (I != DbgValMap.end())
+       return I->second;
+     return ArrayRef<SDDbgValue*>();
+   }
+ 
+   using DbgIterator = SmallVectorImpl<SDDbgValue*>::iterator;
+   using DbgLabelIterator = SmallVectorImpl<SDDbgLabel*>::iterator;
+ 
+   DbgIterator DbgBegin() { return DbgValues.begin(); }
+   DbgIterator DbgEnd()   { return DbgValues.end(); }
+   DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
+   DbgIterator ByvalParmDbgEnd()   { return ByvalParmDbgValues.end(); }
+   DbgLabelIterator DbgLabelBegin() { return DbgLabels.begin(); }
+   DbgLabelIterator DbgLabelEnd()   { return DbgLabels.end(); }
+ };
+ 
+ void checkForCycles(const SelectionDAG *DAG, bool force = false);
+ 
+ /// This is used to represent a portion of an LLVM function in a low-level
+ /// Data Dependence DAG representation suitable for instruction selection.
+ /// This DAG is constructed as the first step of instruction selection in order
+ /// to allow implementation of machine specific optimizations
+ /// and code simplifications.
+ ///
+ /// The representation used by the SelectionDAG is a target-independent
+ /// representation, which has some similarities to the GCC RTL representation,
+ /// but is significantly more simple, powerful, and is a graph form instead of a
+ /// linear form.
+ ///
+ class SelectionDAG {
+   const TargetMachine &TM;
+   const SelectionDAGTargetInfo *TSI = nullptr;
+   const TargetLowering *TLI = nullptr;
+   const TargetLibraryInfo *LibInfo = nullptr;
+   MachineFunction *MF;
+   Pass *SDAGISelPass = nullptr;
+   LLVMContext *Context;
+   CodeGenOpt::Level OptLevel;
+ 
+   LegacyDivergenceAnalysis * DA = nullptr;
+   FunctionLoweringInfo * FLI = nullptr;
+ 
+   /// The function-level optimization remark emitter.  Used to emit remarks
+   /// whenever manipulating the DAG.
+   OptimizationRemarkEmitter *ORE;
+ 
+   /// The starting token.
+   SDNode EntryNode;
+ 
+   /// The root of the entire DAG.
+   SDValue Root;
+ 
+   /// A linked list of nodes in the current DAG.
+   ilist<SDNode> AllNodes;
+ 
+   /// The AllocatorType for allocating SDNodes. We use
+   /// pool allocation with recycling.
+   using NodeAllocatorType = RecyclingAllocator<BumpPtrAllocator, SDNode,
+                                                sizeof(LargestSDNode),
+                                                alignof(MostAlignedSDNode)>;
+ 
+   /// Pool allocation for nodes.
+   NodeAllocatorType NodeAllocator;
+ 
+   /// This structure is used to memoize nodes, automatically performing
+   /// CSE with existing nodes when a duplicate is requested.
+   FoldingSet<SDNode> CSEMap;
+ 
+   /// Pool allocation for machine-opcode SDNode operands.
+   BumpPtrAllocator OperandAllocator;
+   ArrayRecycler<SDUse> OperandRecycler;
+ 
+   /// Pool allocation for misc. objects that are created once per SelectionDAG.
+   BumpPtrAllocator Allocator;
+ 
+   /// Tracks dbg_value and dbg_label information through SDISel.
+   SDDbgInfo *DbgInfo;
+ 
+   uint16_t NextPersistentId = 0;
+ 
+ public:
+   /// Clients of various APIs that cause global effects on
+   /// the DAG can optionally implement this interface.  This allows the clients
+   /// to handle the various sorts of updates that happen.
+   ///
+   /// A DAGUpdateListener automatically registers itself with DAG when it is
+   /// constructed, and removes itself when destroyed in RAII fashion.
+   struct DAGUpdateListener {
+     DAGUpdateListener *const Next;
+     SelectionDAG &DAG;
+ 
+     explicit DAGUpdateListener(SelectionDAG &D)
+       : Next(D.UpdateListeners), DAG(D) {
+       DAG.UpdateListeners = this;
+     }
+ 
+     virtual ~DAGUpdateListener() {
+       assert(DAG.UpdateListeners == this &&
+              "DAGUpdateListeners must be destroyed in LIFO order");
+       DAG.UpdateListeners = Next;
+     }
+ 
+     /// The node N that was deleted and, if E is not null, an
+     /// equivalent node E that replaced it.
+     virtual void NodeDeleted(SDNode *N, SDNode *E);
+ 
+     /// The node N that was updated.
+     virtual void NodeUpdated(SDNode *N);
+   };
+ 
+   struct DAGNodeDeletedListener : public DAGUpdateListener {
+     std::function<void(SDNode *, SDNode *)> Callback;
+ 
+     DAGNodeDeletedListener(SelectionDAG &DAG,
+                            std::function<void(SDNode *, SDNode *)> Callback)
+         : DAGUpdateListener(DAG), Callback(std::move(Callback)) {}
+ 
+     void NodeDeleted(SDNode *N, SDNode *E) override { Callback(N, E); }
+   };
+ 
+   /// When true, additional steps are taken to
+   /// ensure that getConstant() and similar functions return DAG nodes that
+   /// have legal types. This is important after type legalization since
+   /// any illegally typed nodes generated after this point will not experience
+   /// type legalization.
+   bool NewNodesMustHaveLegalTypes = false;
+ 
+ private:
+   /// DAGUpdateListener is a friend so it can manipulate the listener stack.
+   friend struct DAGUpdateListener;
+ 
+   /// Linked list of registered DAGUpdateListener instances.
+   /// This stack is maintained by DAGUpdateListener RAII.
+   DAGUpdateListener *UpdateListeners = nullptr;
+ 
+   /// Implementation of setSubgraphColor.
+   /// Return whether we had to truncate the search.
+   bool setSubgraphColorHelper(SDNode *N, const char *Color,
+                               DenseSet<SDNode *> &visited,
+                               int level, bool &printed);
+ 
+   template <typename SDNodeT, typename... ArgTypes>
+   SDNodeT *newSDNode(ArgTypes &&... Args) {
+     return new (NodeAllocator.template Allocate<SDNodeT>())
+         SDNodeT(std::forward<ArgTypes>(Args)...);
+   }
+ 
+   /// Build a synthetic SDNodeT with the given args and extract its subclass
+   /// data as an integer (e.g. for use in a folding set).
+   ///
+   /// The args to this function are the same as the args to SDNodeT's
+   /// constructor, except the second arg (assumed to be a const DebugLoc&) is
+   /// omitted.
+   template <typename SDNodeT, typename... ArgTypes>
+   static uint16_t getSyntheticNodeSubclassData(unsigned IROrder,
+                                                ArgTypes &&... Args) {
+     // The compiler can reduce this expression to a constant iff we pass an
+     // empty DebugLoc.  Thankfully, the debug location doesn't have any bearing
+     // on the subclass data.
+     return SDNodeT(IROrder, DebugLoc(), std::forward<ArgTypes>(Args)...)
+         .getRawSubclassData();
+   }
+ 
+   template <typename SDNodeTy>
+   static uint16_t getSyntheticNodeSubclassData(unsigned Opc, unsigned Order,
+                                                 SDVTList VTs, EVT MemoryVT,
+                                                 MachineMemOperand *MMO) {
+     return SDNodeTy(Opc, Order, DebugLoc(), VTs, MemoryVT, MMO)
+          .getRawSubclassData();
+   }
+ 
+   void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);
+ 
+   void removeOperands(SDNode *Node) {
+     if (!Node->OperandList)
+       return;
+     OperandRecycler.deallocate(
+         ArrayRecycler<SDUse>::Capacity::get(Node->NumOperands),
+         Node->OperandList);
+     Node->NumOperands = 0;
+     Node->OperandList = nullptr;
+   }
+   void CreateTopologicalOrder(std::vector<SDNode*>& Order);
+ public:
+   explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
+   SelectionDAG(const SelectionDAG &) = delete;
+   SelectionDAG &operator=(const SelectionDAG &) = delete;
+   ~SelectionDAG();
+ 
+   /// Prepare this SelectionDAG to process code in the given MachineFunction.
+   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
+             Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+             LegacyDivergenceAnalysis * Divergence);
+ 
+   void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
+     FLI = FuncInfo;
+   }
+ 
+   /// Clear state and free memory necessary to make this
+   /// SelectionDAG ready to process a new block.
+   void clear();
+ 
+   MachineFunction &getMachineFunction() const { return *MF; }
+   const Pass *getPass() const { return SDAGISelPass; }
+ 
+   const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
+   const TargetMachine &getTarget() const { return TM; }
+   const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
+   const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
+   const TargetLibraryInfo &getLibInfo() const { return *LibInfo; }
+   const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
+   LLVMContext *getContext() const {return Context; }
+   OptimizationRemarkEmitter &getORE() const { return *ORE; }
+ 
+   /// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
+   void viewGraph(const std::string &Title);
+   void viewGraph();
+ 
+ #ifndef NDEBUG
+   std::map<const SDNode *, std::string> NodeGraphAttrs;
+ #endif
+ 
+   /// Clear all previously defined node graph attributes.
+   /// Intended to be used from a debugging tool (eg. gdb).
+   void clearGraphAttrs();
+ 
+   /// Set graph attributes for a node. (eg. "color=red".)
+   void setGraphAttrs(const SDNode *N, const char *Attrs);
+ 
+   /// Get graph attributes for a node. (eg. "color=red".)
+   /// Used from getNodeAttributes.
+   const std::string getGraphAttrs(const SDNode *N) const;
+ 
+   /// Convenience for setting node color attribute.
+   void setGraphColor(const SDNode *N, const char *Color);
+ 
+   /// Convenience for setting subgraph color attribute.
+   void setSubgraphColor(SDNode *N, const char *Color);
+ 
+   using allnodes_const_iterator = ilist<SDNode>::const_iterator;
+ 
+   allnodes_const_iterator allnodes_begin() const { return AllNodes.begin(); }
+   allnodes_const_iterator allnodes_end() const { return AllNodes.end(); }
+ 
+   using allnodes_iterator = ilist<SDNode>::iterator;
+ 
+   allnodes_iterator allnodes_begin() { return AllNodes.begin(); }
+   allnodes_iterator allnodes_end() { return AllNodes.end(); }
+ 
+   ilist<SDNode>::size_type allnodes_size() const {
+     return AllNodes.size();
+   }
+ 
+   iterator_range<allnodes_iterator> allnodes() {
+     return make_range(allnodes_begin(), allnodes_end());
+   }
+   iterator_range<allnodes_const_iterator> allnodes() const {
+     return make_range(allnodes_begin(), allnodes_end());
+   }
+ 
+   /// Return the root tag of the SelectionDAG.
+   const SDValue &getRoot() const { return Root; }
+ 
+   /// Return the token chain corresponding to the entry of the function.
+   SDValue getEntryNode() const {
+     return SDValue(const_cast<SDNode *>(&EntryNode), 0);
+   }
+ 
+   /// Set the current root tag of the SelectionDAG.
+   ///
+   const SDValue &setRoot(SDValue N) {
+     assert((!N.getNode() || N.getValueType() == MVT::Other) &&
+            "DAG root value is not a chain!");
+     if (N.getNode())
+       checkForCycles(N.getNode(), this);
+     Root = N;
+     if (N.getNode())
+       checkForCycles(this);
+     return Root;
+   }
+ 
+ #ifndef NDEBUG
+   void VerifyDAGDiverence();
+ #endif
+ 
+   /// This iterates over the nodes in the SelectionDAG, folding
+   /// certain types of nodes together, or eliminating superfluous nodes.  The
+   /// Level argument controls whether Combine is allowed to produce nodes and
+   /// types that are illegal on the target.
+   void Combine(CombineLevel Level, AliasAnalysis *AA,
+                CodeGenOpt::Level OptLevel);
+ 
+   /// This transforms the SelectionDAG into a SelectionDAG that
+   /// only uses types natively supported by the target.
+   /// Returns "true" if it made any changes.
+   ///
+   /// Note that this is an involved process that may invalidate pointers into
+   /// the graph.
+   bool LegalizeTypes();
+ 
+   /// This transforms the SelectionDAG into a SelectionDAG that is
+   /// compatible with the target instruction selector, as indicated by the
+   /// TargetLowering object.
+   ///
+   /// Note that this is an involved process that may invalidate pointers into
+   /// the graph.
+   void Legalize();
+ 
+   /// Transforms a SelectionDAG node and any operands to it into a node
+   /// that is compatible with the target instruction selector, as indicated by
+   /// the TargetLowering object.
+   ///
+   /// \returns true if \c N is a valid, legal node after calling this.
+   ///
+   /// This essentially runs a single recursive walk of the \c Legalize process
+   /// over the given node (and its operands). This can be used to incrementally
+   /// legalize the DAG. All of the nodes which are directly replaced,
+   /// potentially including N, are added to the output parameter \c
+   /// UpdatedNodes so that the delta to the DAG can be understood by the
+   /// caller.
+   ///
+   /// When this returns false, N has been legalized in a way that make the
+   /// pointer passed in no longer valid. It may have even been deleted from the
+   /// DAG, and so it shouldn't be used further. When this returns true, the
+   /// N passed in is a legal node, and can be immediately processed as such.
+   /// This may still have done some work on the DAG, and will still populate
+   /// UpdatedNodes with any new nodes replacing those originally in the DAG.
+   bool LegalizeOp(SDNode *N, SmallSetVector<SDNode *, 16> &UpdatedNodes);
+ 
+   /// This transforms the SelectionDAG into a SelectionDAG
+   /// that only uses vector math operations supported by the target.  This is
+   /// necessary as a separate step from Legalize because unrolling a vector
+   /// operation can introduce illegal types, which requires running
+   /// LegalizeTypes again.
+   ///
+   /// This returns true if it made any changes; in that case, LegalizeTypes
+   /// is called again before Legalize.
+   ///
+   /// Note that this is an involved process that may invalidate pointers into
+   /// the graph.
+   bool LegalizeVectors();
+ 
+   /// This method deletes all unreachable nodes in the SelectionDAG.
+   void RemoveDeadNodes();
+ 
+   /// Remove the specified node from the system.  This node must
+   /// have no referrers.
+   void DeleteNode(SDNode *N);
+ 
+   /// Return an SDVTList that represents the list of values specified.
+   SDVTList getVTList(EVT VT);
+   SDVTList getVTList(EVT VT1, EVT VT2);
+   SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3);
+   SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4);
+   SDVTList getVTList(ArrayRef<EVT> VTs);
+ 
+   //===--------------------------------------------------------------------===//
+   // Node creation methods.
+ 
+   /// Create a ConstantSDNode wrapping a constant value.
+   /// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
+   ///
+   /// If only legal types can be produced, this does the necessary
+   /// transformations (e.g., if the vector element type is illegal).
+   /// @{
+   SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+                       bool isTarget = false, bool isOpaque = false);
+   SDValue getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+                       bool isTarget = false, bool isOpaque = false);
+ 
+   SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
+                              bool IsOpaque = false) {
+     return getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), DL,
+                        VT, IsTarget, IsOpaque);
+   }
+ 
+   SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
+                       bool isTarget = false, bool isOpaque = false);
+   SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
+                             bool isTarget = false);
+   SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+                             bool isOpaque = false) {
+     return getConstant(Val, DL, VT, true, isOpaque);
+   }
+   SDValue getTargetConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+                             bool isOpaque = false) {
+     return getConstant(Val, DL, VT, true, isOpaque);
+   }
+   SDValue getTargetConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
+                             bool isOpaque = false) {
+     return getConstant(Val, DL, VT, true, isOpaque);
+   }
+ 
+   /// Create a true or false constant of type \p VT using the target's
+   /// BooleanContent for type \p OpVT.
+   SDValue getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT);
+   /// @}
+ 
+   /// Create a ConstantFPSDNode wrapping a constant value.
+   /// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
+   ///
+   /// If only legal types can be produced, this does the necessary
+   /// transformations (e.g., if the vector element type is illegal).
+   /// The forms that take a double should only be used for simple constants
+   /// that can be exactly represented in VT.  No checks are made.
+   /// @{
+   SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT,
+                         bool isTarget = false);
+   SDValue getConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT,
+                         bool isTarget = false);
+   SDValue getConstantFP(const ConstantFP &V, const SDLoc &DL, EVT VT,
+                         bool isTarget = false);
+   SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT) {
+     return getConstantFP(Val, DL, VT, true);
+   }
+   SDValue getTargetConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT) {
+     return getConstantFP(Val, DL, VT, true);
+   }
+   SDValue getTargetConstantFP(const ConstantFP &Val, const SDLoc &DL, EVT VT) {
+     return getConstantFP(Val, DL, VT, true);
+   }
+   /// @}
+ 
+   SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
+                            int64_t offset = 0, bool isTargetGA = false,
+                            unsigned char TargetFlags = 0);
+   SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
+                                  int64_t offset = 0,
+                                  unsigned char TargetFlags = 0) {
+     return getGlobalAddress(GV, DL, VT, offset, true, TargetFlags);
+   }
+   SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false);
+   SDValue getTargetFrameIndex(int FI, EVT VT) {
+     return getFrameIndex(FI, VT, true);
+   }
+   SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false,
+                        unsigned char TargetFlags = 0);
+   SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) {
+     return getJumpTable(JTI, VT, true, TargetFlags);
+   }
+   SDValue getConstantPool(const Constant *C, EVT VT,
+                           unsigned Align = 0, int Offs = 0, bool isT=false,
+                           unsigned char TargetFlags = 0);
+   SDValue getTargetConstantPool(const Constant *C, EVT VT,
+                                 unsigned Align = 0, int Offset = 0,
+                                 unsigned char TargetFlags = 0) {
+     return getConstantPool(C, VT, Align, Offset, true, TargetFlags);
+   }
+   SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT,
+                           unsigned Align = 0, int Offs = 0, bool isT=false,
+                           unsigned char TargetFlags = 0);
+   SDValue getTargetConstantPool(MachineConstantPoolValue *C,
+                                   EVT VT, unsigned Align = 0,
+                                   int Offset = 0, unsigned char TargetFlags=0) {
+     return getConstantPool(C, VT, Align, Offset, true, TargetFlags);
+   }
+   SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0,
+                          unsigned char TargetFlags = 0);
+   // When generating a branch to a BB, we don't in general know enough
+   // to provide debug info for the BB at that time, so keep this one around.
+   SDValue getBasicBlock(MachineBasicBlock *MBB);
+   SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl);
+   SDValue getExternalSymbol(const char *Sym, EVT VT);
+   SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT);
+   SDValue getTargetExternalSymbol(const char *Sym, EVT VT,
+                                   unsigned char TargetFlags = 0);
+   SDValue getMCSymbol(MCSymbol *Sym, EVT VT);
+ 
+   SDValue getValueType(EVT);
+   SDValue getRegister(unsigned Reg, EVT VT);
+   SDValue getRegisterMask(const uint32_t *RegMask);
+   SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label);
+   SDValue getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root,
+                        MCSymbol *Label);
+   SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
+                           int64_t Offset = 0, bool isTarget = false,
+                           unsigned char TargetFlags = 0);
+   SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
+                                 int64_t Offset = 0,
+                                 unsigned char TargetFlags = 0) {
+     return getBlockAddress(BA, VT, Offset, true, TargetFlags);
+   }
+ 
+   SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg,
+                        SDValue N) {
+     return getNode(ISD::CopyToReg, dl, MVT::Other, Chain,
+                    getRegister(Reg, N.getValueType()), N);
+   }
+ 
+   // This version of the getCopyToReg method takes an extra operand, which
+   // indicates that there is potentially an incoming glue value (if Glue is not
+   // null) and that there should be a glue result.
+   SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N,
+                        SDValue Glue) {
+     SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
+     SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue };
+     return getNode(ISD::CopyToReg, dl, VTs,
+                    makeArrayRef(Ops, Glue.getNode() ? 4 : 3));
+   }
+ 
+   // Similar to last getCopyToReg() except parameter Reg is a SDValue
+   SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, SDValue Reg, SDValue N,
+                        SDValue Glue) {
+     SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
+     SDValue Ops[] = { Chain, Reg, N, Glue };
+     return getNode(ISD::CopyToReg, dl, VTs,
+                    makeArrayRef(Ops, Glue.getNode() ? 4 : 3));
+   }
+ 
+   SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT) {
+     SDVTList VTs = getVTList(VT, MVT::Other);
+     SDValue Ops[] = { Chain, getRegister(Reg, VT) };
+     return getNode(ISD::CopyFromReg, dl, VTs, Ops);
+   }
+ 
+   // This version of the getCopyFromReg method takes an extra operand, which
+   // indicates that there is potentially an incoming glue value (if Glue is not
+   // null) and that there should be a glue result.
+   SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT,
+                          SDValue Glue) {
+     SDVTList VTs = getVTList(VT, MVT::Other, MVT::Glue);
+     SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue };
+     return getNode(ISD::CopyFromReg, dl, VTs,
+                    makeArrayRef(Ops, Glue.getNode() ? 3 : 2));
+   }
+ 
+   SDValue getCondCode(ISD::CondCode Cond);
+ 
+   /// Return an ISD::VECTOR_SHUFFLE node. The number of elements in VT,
+   /// which must be a vector type, must match the number of mask elements
+   /// NumElts. An integer mask element equal to -1 is treated as undefined.
+   SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
+                            ArrayRef<int> Mask);
+ 
+   /// Return an ISD::BUILD_VECTOR node. The number of elements in VT,
+   /// which must be a vector type, must match the number of operands in Ops.
+   /// The operands must have the same type as (or, for integers, a type wider
+   /// than) VT's element type.
+   SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDValue> Ops) {
+     // VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
+     return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+   }
+ 
+   /// Return an ISD::BUILD_VECTOR node. The number of elements in VT,
+   /// which must be a vector type, must match the number of operands in Ops.
+   /// The operands must have the same type as (or, for integers, a type wider
+   /// than) VT's element type.
+   SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDUse> Ops) {
+     // VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
+     return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+   }
+ 
+   /// Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all
+   /// elements. VT must be a vector type. Op's type must be the same as (or,
+   /// for integers, a type wider than) VT's element type.
+   SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) {
+     // VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
+     if (Op.getOpcode() == ISD::UNDEF) {
+       assert((VT.getVectorElementType() == Op.getValueType() ||
+               (VT.isInteger() &&
+                VT.getVectorElementType().bitsLE(Op.getValueType()))) &&
+              "A splatted value must have a width equal or (for integers) "
+              "greater than the vector element type!");
+       return getNode(ISD::UNDEF, SDLoc(), VT);
+     }
+ 
+     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Op);
+     return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+   }
+ 
+   /// Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
+   /// the shuffle node in input but with swapped operands.
+   ///
+   /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
+   SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
+ 
+   /// Convert Op, which must be of float type, to the
+   /// float type VT, by either extending or rounding (by truncation).
+   SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);
+ 
+   /// Convert Op, which must be of integer type, to the
+   /// integer type VT, by either any-extending or truncating it.
+   SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
+ 
+   /// Convert Op, which must be of integer type, to the
+   /// integer type VT, by either sign-extending or truncating it.
+   SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
+ 
+   /// Convert Op, which must be of integer type, to the
+   /// integer type VT, by either zero-extending or truncating it.
+   SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
+ 
+   /// Return the expression required to zero extend the Op
+   /// value assuming it was the smaller SrcTy value.
+   SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
+ 
+   /// Convert Op, which must be of integer type, to the integer type VT,
+   /// by using an extension appropriate for the target's
+   /// BooleanContent for type OpVT or truncating it.
+   SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT);
+ 
+   /// Create a bitwise NOT operation as (XOR Val, -1).
+   SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT);
+ 
+   /// Create a logical NOT operation as (XOR Val, BooleanOne).
+   SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);
+ 
+   /// Create an add instruction with appropriate flags when used for
+   /// addressing some offset of an object. i.e. if a load is split into multiple
+   /// components, create an add nuw from the base pointer to the offset.
+   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, int64_t Offset) {
+     EVT VT = Op.getValueType();
+     return getObjectPtrOffset(SL, Op, getConstant(Offset, SL, VT));
+   }
+ 
+   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, SDValue Offset) {
+     EVT VT = Op.getValueType();
+ 
+     // The object itself can't wrap around the address space, so it shouldn't be
+     // possible for the adds of the offsets to the split parts to overflow.
+     SDNodeFlags Flags;
+     Flags.setNoUnsignedWrap(true);
+     return getNode(ISD::ADD, SL, VT, Op, Offset, Flags);
+   }
+ 
+   /// Return a new CALLSEQ_START node, that starts new call frame, in which
+   /// InSize bytes are set up inside CALLSEQ_START..CALLSEQ_END sequence and
+   /// OutSize specifies part of the frame set up prior to the sequence.
+   SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize,
+                            const SDLoc &DL) {
+     SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
+     SDValue Ops[] = { Chain,
+                       getIntPtrConstant(InSize, DL, true),
+                       getIntPtrConstant(OutSize, DL, true) };
+     return getNode(ISD::CALLSEQ_START, DL, VTs, Ops);
+   }
+ 
+   /// Return a new CALLSEQ_END node, which always must have a
+   /// glue result (to ensure it's not CSE'd).
+   /// CALLSEQ_END does not have a useful SDLoc.
+   SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2,
+                          SDValue InGlue, const SDLoc &DL) {
+     SDVTList NodeTys = getVTList(MVT::Other, MVT::Glue);
+     SmallVector<SDValue, 4> Ops;
+     Ops.push_back(Chain);
+     Ops.push_back(Op1);
+     Ops.push_back(Op2);
+     if (InGlue.getNode())
+       Ops.push_back(InGlue);
+     return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops);
+   }
+ 
+   /// Return true if the result of this operation is always undefined.
+   bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);
+ 
+   /// Return an UNDEF node. UNDEF does not have a useful SDLoc.
+   SDValue getUNDEF(EVT VT) {
+     return getNode(ISD::UNDEF, SDLoc(), VT);
+   }
+ 
+   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
+   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
+     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
+   }
+ 
+   /// Gets or creates the specified node.
+   ///
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                   ArrayRef<SDUse> Ops);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                   ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags());
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys,
+                   ArrayRef<SDValue> Ops);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                   ArrayRef<SDValue> Ops);
+ 
+   // Specialize based on number of operands.
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
+                   const SDNodeFlags Flags = SDNodeFlags());
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                   SDValue N2, const SDNodeFlags Flags = SDNodeFlags());
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                   SDValue N2, SDValue N3,
+                   const SDNodeFlags Flags = SDNodeFlags());
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                   SDValue N2, SDValue N3, SDValue N4);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+                   SDValue N2, SDValue N3, SDValue N4, SDValue N5);
+ 
+   // Specialize again based on number of operands for nodes with a VTList
+   // rather than a single VT.
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
+                   SDValue N2);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
+                   SDValue N2, SDValue N3);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
+                   SDValue N2, SDValue N3, SDValue N4);
+   SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
+                   SDValue N2, SDValue N3, SDValue N4, SDValue N5);
+ 
+   /// Compute a TokenFactor to force all the incoming stack arguments to be
+   /// loaded from the stack. This is used in tail call lowering to protect
+   /// stack arguments from being clobbered.
+   SDValue getStackArgumentTokenFactor(SDValue Chain);
+ 
+   SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
+                     SDValue Size, unsigned Align, bool isVol, bool AlwaysInline,
+                     bool isTailCall, MachinePointerInfo DstPtrInfo,
+                     MachinePointerInfo SrcPtrInfo);
+ 
+   SDValue getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
+                      SDValue Size, unsigned Align, bool isVol, bool isTailCall,
+                      MachinePointerInfo DstPtrInfo,
+                      MachinePointerInfo SrcPtrInfo);
+ 
+   SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
+                     SDValue Size, unsigned Align, bool isVol, bool isTailCall,
+                     MachinePointerInfo DstPtrInfo);
+ 
+   SDValue getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                           unsigned DstAlign, SDValue Src, unsigned SrcAlign,
+                           SDValue Size, Type *SizeTy, unsigned ElemSz,
+                           bool isTailCall, MachinePointerInfo DstPtrInfo,
+                           MachinePointerInfo SrcPtrInfo);
+ 
+   SDValue getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                            unsigned DstAlign, SDValue Src, unsigned SrcAlign,
+                            SDValue Size, Type *SizeTy, unsigned ElemSz,
+                            bool isTailCall, MachinePointerInfo DstPtrInfo,
+                            MachinePointerInfo SrcPtrInfo);
+ 
+   SDValue getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                           unsigned DstAlign, SDValue Value, SDValue Size,
+                           Type *SizeTy, unsigned ElemSz, bool isTailCall,
+                           MachinePointerInfo DstPtrInfo);
+ 
+   /// Helper function to make it easier to build SetCC's if you just have an
+   /// ISD::CondCode instead of an SDValue.
+   SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS,
+                    ISD::CondCode Cond) {
+     assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() &&
+            "Cannot compare scalars to vectors");
+     assert(LHS.getValueType().isVector() == VT.isVector() &&
+            "Cannot compare scalars to vectors");
+     assert(Cond != ISD::SETCC_INVALID &&
+            "Cannot create a setCC of an invalid node.");
+     return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond));
+   }
+ 
+   /// Helper function to make it easier to build Select's if you just have
+   /// operands and don't want to check for vector.
+   SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
+                     SDValue RHS) {
+     assert(LHS.getValueType() == RHS.getValueType() &&
+            "Cannot use select on differing types");
+     assert(VT.isVector() == LHS.getValueType().isVector() &&
+            "Cannot mix vectors and scalars");
+     auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
+     return getNode(Opcode, DL, VT, Cond, LHS, RHS);
+   }
+ 
+   /// Helper function to make it easier to build SelectCC's if you just have an
+   /// ISD::CondCode instead of an SDValue.
+   SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
+                       SDValue False, ISD::CondCode Cond) {
+     return getNode(ISD::SELECT_CC, DL, True.getValueType(), LHS, RHS, True,
+                    False, getCondCode(Cond));
+   }
+ 
+   /// Try to simplify a select/vselect into 1 of its operands or a constant.
+   SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal);
+ 
+   /// Try to simplify a shift into 1 of its operands or a constant.
+   SDValue simplifyShift(SDValue X, SDValue Y);
+ 
+   /// VAArg produces a result and token chain, and takes a pointer
+   /// and a source value as input.
+   SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+                    SDValue SV, unsigned Align);
+ 
+   /// Gets a node for an atomic cmpxchg op. There are two
+   /// valid Opcodes. ISD::ATOMIC_CMO_SWAP produces the value loaded and a
+   /// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
+   /// a success flag (initially i1), and a chain.
+   SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                            SDVTList VTs, SDValue Chain, SDValue Ptr,
+                            SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
+                            unsigned Alignment, AtomicOrdering SuccessOrdering,
+                            AtomicOrdering FailureOrdering,
+                            SyncScope::ID SSID);
+   SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                            SDVTList VTs, SDValue Chain, SDValue Ptr,
+                            SDValue Cmp, SDValue Swp, MachineMemOperand *MMO);
+ 
+   /// Gets a node for an atomic op, produces result (if relevant)
+   /// and chain and takes 2 operands.
+   SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
+                     SDValue Ptr, SDValue Val, const Value *PtrVal,
+                     unsigned Alignment, AtomicOrdering Ordering,
+                     SyncScope::ID SSID);
+   SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
+                     SDValue Ptr, SDValue Val, MachineMemOperand *MMO);
+ 
+   /// Gets a node for an atomic op, produces result and chain and
+   /// takes 1 operand.
+   SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, EVT VT,
+                     SDValue Chain, SDValue Ptr, MachineMemOperand *MMO);
+ 
+   /// Gets a node for an atomic op, produces result and chain and takes N
+   /// operands.
+   SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                     SDVTList VTList, ArrayRef<SDValue> Ops,
+                     MachineMemOperand *MMO);
+ 
+   /// Creates a MemIntrinsicNode that may produce a
+   /// result and takes a list of operands. Opcode may be INTRINSIC_VOID,
+   /// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not
+   /// less than FIRST_TARGET_MEMORY_OPCODE.
+   SDValue getMemIntrinsicNode(
+     unsigned Opcode, const SDLoc &dl, SDVTList VTList,
+     ArrayRef<SDValue> Ops, EVT MemVT,
+     MachinePointerInfo PtrInfo,
+     unsigned Align = 0,
+     MachineMemOperand::Flags Flags
+     = MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+     unsigned Size = 0);
+ 
+   SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
+                               ArrayRef<SDValue> Ops, EVT MemVT,
+                               MachineMemOperand *MMO);
+ 
+   /// Create a MERGE_VALUES node from the given operands.
+   SDValue getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl);
+ 
+   /// Loads are not normal binary operators: their result type is not
+   /// determined by their operands, and they produce a value AND a token chain.
+   ///
+   /// This function will set the MOLoad flag on MMOFlags, but you can set it if
+   /// you want.  The MOStore flag must not be set.
+   SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+                   MachinePointerInfo PtrInfo, unsigned Alignment = 0,
+                   MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+                   const AAMDNodes &AAInfo = AAMDNodes(),
+                   const MDNode *Ranges = nullptr);
+   SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+                   MachineMemOperand *MMO);
+   SDValue
+   getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain,
+              SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT,
+              unsigned Alignment = 0,
+              MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+              const AAMDNodes &AAInfo = AAMDNodes());
+   SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
+                      SDValue Chain, SDValue Ptr, EVT MemVT,
+                      MachineMemOperand *MMO);
+   SDValue getIndexedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
+                          SDValue Offset, ISD::MemIndexedMode AM);
+   SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
+                   const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
+                   MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0,
+                   MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+                   const AAMDNodes &AAInfo = AAMDNodes(),
+                   const MDNode *Ranges = nullptr);
+   SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
+                   const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
+                   EVT MemVT, MachineMemOperand *MMO);
+ 
+   /// Helper function to build ISD::STORE nodes.
+   ///
+   /// This function will set the MOStore flag on MMOFlags, but you can set it if
+   /// you want.  The MOLoad and MOInvariant flags must not be set.
+   SDValue
+   getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
+            MachinePointerInfo PtrInfo, unsigned Alignment = 0,
+            MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+            const AAMDNodes &AAInfo = AAMDNodes());
+   SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
+                    MachineMemOperand *MMO);
+   SDValue
+   getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
+                 MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0,
+                 MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
+                 const AAMDNodes &AAInfo = AAMDNodes());
+   SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                         SDValue Ptr, EVT SVT, MachineMemOperand *MMO);
+   SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
+                           SDValue Offset, ISD::MemIndexedMode AM);
+ 
+   /// Returns sum of the base pointer and offset.
+   SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL);
+ 
+   SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+                         SDValue Mask, SDValue Src0, EVT MemVT,
+                         MachineMemOperand *MMO, ISD::LoadExtType,
+                         bool IsExpanding = false);
+   SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                          SDValue Ptr, SDValue Mask, EVT MemVT,
+                          MachineMemOperand *MMO, bool IsTruncating = false,
+                          bool IsCompressing = false);
+   SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
+                           ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
+   SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
+                            ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
+ 
+   /// Return (create a new or find existing) a target-specific node.
+   /// TargetMemSDNode should be derived class from MemSDNode.
+   template <class TargetMemSDNode>
+   SDValue getTargetMemSDNode(SDVTList VTs, ArrayRef<SDValue> Ops,
+                              const SDLoc &dl, EVT MemVT,
+                              MachineMemOperand *MMO);
+ 
+   /// Construct a node to track a Value* through the backend.
+   SDValue getSrcValue(const Value *v);
+ 
+   /// Return an MDNodeSDNode which holds an MDNode.
+   SDValue getMDNode(const MDNode *MD);
+ 
+   /// Return a bitcast using the SDLoc of the value operand, and casting to the
+   /// provided type. Use getNode to set a custom SDLoc.
+   SDValue getBitcast(EVT VT, SDValue V);
+ 
+   /// Return an AddrSpaceCastSDNode.
+   SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS,
+                            unsigned DestAS);
+ 
+   /// Return the specified value casted to
+   /// the target's desired shift amount type.
+   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
+ 
+   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
+   SDValue expandVAArg(SDNode *Node);
+ 
+   /// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
+   SDValue expandVACopy(SDNode *Node);
+ 
++  /// Returns a GlobalAddress of the function from the current module with
++  /// name matching the given ExternalSymbol.
++  /// Reports a fatal error if the function doesn't exist.
++  SDValue getSymbolFunctionGlobalAddress(SDValue Op);
++
+   /// *Mutate* the specified node in-place to have the
+   /// specified operands.  If the resultant node already exists in the DAG,
+   /// this does not modify the specified node, instead it returns the node that
+   /// already exists.  If the resultant node does not exist in the DAG, the
+   /// input node is returned.  As a degenerate case, if you specify the same
+   /// input operands as the node already has, the input node is returned.
+   SDNode *UpdateNodeOperands(SDNode *N, SDValue Op);
+   SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2);
+   SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                SDValue Op3);
+   SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                SDValue Op3, SDValue Op4);
+   SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                SDValue Op3, SDValue Op4, SDValue Op5);
+   SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops);
+ 
+   /// *Mutate* the specified machine node's memory references to the provided
+   /// list.
+   void setNodeMemRefs(MachineSDNode *N,
+                       ArrayRef<MachineMemOperand *> NewMemRefs);
+ 
+   // Propagates the change in divergence to users
+   void updateDivergence(SDNode * N);
+ 
+   /// These are used for target selectors to *mutate* the
+   /// specified node to have the specified return type, Target opcode, and
+   /// operands.  Note that target opcodes are stored as
+   /// ~TargetOpcode in the node opcode field.  The resultant node is returned.
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                        SDValue Op1, SDValue Op2);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                        SDValue Op1, SDValue Op2, SDValue Op3);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                        ArrayRef<SDValue> Ops);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                        EVT VT2, ArrayRef<SDValue> Ops);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                        EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
+   SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
+                        EVT VT2, SDValue Op1);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                        EVT VT2, SDValue Op1, SDValue Op2);
+   SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,
+                        ArrayRef<SDValue> Ops);
+ 
+   /// This *mutates* the specified node to have the specified
+   /// return type, opcode, and operands.
+   SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs,
+                       ArrayRef<SDValue> Ops);
+ 
+   /// Mutate the specified strict FP node to its non-strict equivalent,
+   /// unlinking the node from its chain and dropping the metadata arguments.
+   /// The node must be a strict FP node.
+   SDNode *mutateStrictFPToFP(SDNode *Node);
+ 
+   /// These are used for target selectors to create a new node
+   /// with specified return type(s), MachineInstr opcode, and operands.
+   ///
+   /// Note that getMachineNode returns the resultant node.  If there is already
+   /// a node of the specified opcode and operands, it returns that node instead
+   /// of the current one.
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
+                                 SDValue Op1);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
+                                 SDValue Op1, SDValue Op2);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
+                                 SDValue Op1, SDValue Op2, SDValue Op3);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
+                                 ArrayRef<SDValue> Ops);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, SDValue Op1, SDValue Op2);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, ArrayRef<SDValue> Ops);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, EVT VT3, SDValue Op1, SDValue Op2);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, EVT VT3, SDValue Op1, SDValue Op2,
+                                 SDValue Op3);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
+                                 EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                 ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
+   MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, SDVTList VTs,
+                                 ArrayRef<SDValue> Ops);
+ 
+   /// A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
+   SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+                                  SDValue Operand);
+ 
+   /// A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
+   SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+                                 SDValue Operand, SDValue Subreg);
+ 
+   /// Get the specified node if it's already available, or else return NULL.
+   SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops,
+                           const SDNodeFlags Flags = SDNodeFlags());
+ 
+   /// Creates a SDDbgValue node.
+   SDDbgValue *getDbgValue(DIVariable *Var, DIExpression *Expr, SDNode *N,
+                           unsigned R, bool IsIndirect, const DebugLoc &DL,
+                           unsigned O);
+ 
+   /// Creates a constant SDDbgValue node.
+   SDDbgValue *getConstantDbgValue(DIVariable *Var, DIExpression *Expr,
+                                   const Value *C, const DebugLoc &DL,
+                                   unsigned O);
+ 
+   /// Creates a FrameIndex SDDbgValue node.
+   SDDbgValue *getFrameIndexDbgValue(DIVariable *Var, DIExpression *Expr,
+                                     unsigned FI, bool IsIndirect,
+                                     const DebugLoc &DL, unsigned O);
+ 
+   /// Creates a VReg SDDbgValue node.
+   SDDbgValue *getVRegDbgValue(DIVariable *Var, DIExpression *Expr,
+                               unsigned VReg, bool IsIndirect,
+                               const DebugLoc &DL, unsigned O);
+ 
+   /// Creates a SDDbgLabel node.
+   SDDbgLabel *getDbgLabel(DILabel *Label, const DebugLoc &DL, unsigned O);
+ 
+   /// Transfer debug values from one node to another, while optionally
+   /// generating fragment expressions for split-up values. If \p InvalidateDbg
+   /// is set, debug values are invalidated after they are transferred.
+   void transferDbgValues(SDValue From, SDValue To, unsigned OffsetInBits = 0,
+                          unsigned SizeInBits = 0, bool InvalidateDbg = true);
+ 
+   /// Remove the specified node from the system. If any of its
+   /// operands then becomes dead, remove them as well. Inform UpdateListener
+   /// for each node deleted.
+   void RemoveDeadNode(SDNode *N);
+ 
+   /// This method deletes the unreachable nodes in the
+   /// given list, and any nodes that become unreachable as a result.
+   void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes);
+ 
+   /// Modify anything using 'From' to use 'To' instead.
+   /// This can cause recursive merging of nodes in the DAG.  Use the first
+   /// version if 'From' is known to have a single result, use the second
+   /// if you have two nodes with identical results (or if 'To' has a superset
+   /// of the results of 'From'), use the third otherwise.
+   ///
+   /// These methods all take an optional UpdateListener, which (if not null) is
+   /// informed about nodes that are deleted and modified due to recursive
+   /// changes in the dag.
+   ///
+   /// These functions only replace all existing uses. It's possible that as
+   /// these replacements are being performed, CSE may cause the From node
+   /// to be given new uses. These new uses of From are left in place, and
+   /// not automatically transferred to To.
+   ///
+   void ReplaceAllUsesWith(SDValue From, SDValue To);
+   void ReplaceAllUsesWith(SDNode *From, SDNode *To);
+   void ReplaceAllUsesWith(SDNode *From, const SDValue *To);
+ 
+   /// Replace any uses of From with To, leaving
+   /// uses of other values produced by From.getNode() alone.
+   void ReplaceAllUsesOfValueWith(SDValue From, SDValue To);
+ 
+   /// Like ReplaceAllUsesOfValueWith, but for multiple values at once.
+   /// This correctly handles the case where
+   /// there is an overlap between the From values and the To values.
+   void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To,
+                                   unsigned Num);
+ 
+   /// If an existing load has uses of its chain, create a token factor node with
+   /// that chain and the new memory node's chain and update users of the old
+   /// chain to the token factor. This ensures that the new memory node will have
+   /// the same relative memory dependency position as the old load. Returns the
+   /// new merged load chain.
+   SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
+ 
+   /// Topological-sort the AllNodes list and a
+   /// assign a unique node id for each node in the DAG based on their
+   /// topological order. Returns the number of nodes.
+   unsigned AssignTopologicalOrder();
+ 
+   /// Move node N in the AllNodes list to be immediately
+   /// before the given iterator Position. This may be used to update the
+   /// topological ordering when the list of nodes is modified.
+   void RepositionNode(allnodes_iterator Position, SDNode *N) {
+     AllNodes.insert(Position, AllNodes.remove(N));
+   }
+ 
+   /// Returns an APFloat semantics tag appropriate for the given type. If VT is
+   /// a vector type, the element semantics are returned.
+   static const fltSemantics &EVTToAPFloatSemantics(EVT VT) {
+     switch (VT.getScalarType().getSimpleVT().SimpleTy) {
+     default: llvm_unreachable("Unknown FP format");
+     case MVT::f16:     return APFloat::IEEEhalf();
+     case MVT::f32:     return APFloat::IEEEsingle();
+     case MVT::f64:     return APFloat::IEEEdouble();
+     case MVT::f80:     return APFloat::x87DoubleExtended();
+     case MVT::f128:    return APFloat::IEEEquad();
+     case MVT::ppcf128: return APFloat::PPCDoubleDouble();
+     }
+   }
+ 
+   /// Add a dbg_value SDNode. If SD is non-null that means the
+   /// value is produced by SD.
+   void AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter);
+ 
+   /// Add a dbg_label SDNode.
+   void AddDbgLabel(SDDbgLabel *DB);
+ 
+   /// Get the debug values which reference the given SDNode.
+   ArrayRef<SDDbgValue*> GetDbgValues(const SDNode* SD) const {
+     return DbgInfo->getSDDbgValues(SD);
+   }
+ 
+ public:
+   /// Return true if there are any SDDbgValue nodes associated
+   /// with this SelectionDAG.
+   bool hasDebugValues() const { return !DbgInfo->empty(); }
+ 
+   SDDbgInfo::DbgIterator DbgBegin() { return DbgInfo->DbgBegin(); }
+   SDDbgInfo::DbgIterator DbgEnd()   { return DbgInfo->DbgEnd(); }
+ 
+   SDDbgInfo::DbgIterator ByvalParmDbgBegin() {
+     return DbgInfo->ByvalParmDbgBegin();
+   }
+ 
+   SDDbgInfo::DbgIterator ByvalParmDbgEnd()   {
+     return DbgInfo->ByvalParmDbgEnd();
+   }
+ 
+   SDDbgInfo::DbgLabelIterator DbgLabelBegin() {
+     return DbgInfo->DbgLabelBegin();
+   }
+   SDDbgInfo::DbgLabelIterator DbgLabelEnd() {
+     return DbgInfo->DbgLabelEnd();
+   }
+ 
+   /// To be invoked on an SDNode that is slated to be erased. This
+   /// function mirrors \c llvm::salvageDebugInfo.
+   void salvageDebugInfo(SDNode &N);
+ 
+   void dump() const;
+ 
+   /// Create a stack temporary, suitable for holding the specified value type.
+   /// If minAlign is specified, the slot size will have at least that alignment.
+   SDValue CreateStackTemporary(EVT VT, unsigned minAlign = 1);
+ 
+   /// Create a stack temporary suitable for holding either of the specified
+   /// value types.
+   SDValue CreateStackTemporary(EVT VT1, EVT VT2);
+ 
+   SDValue FoldSymbolOffset(unsigned Opcode, EVT VT,
+                            const GlobalAddressSDNode *GA,
+                            const SDNode *N2);
+ 
+   SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
+                                  SDNode *Cst1, SDNode *Cst2);
+ 
+   SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
+                                  const ConstantSDNode *Cst1,
+                                  const ConstantSDNode *Cst2);
+ 
+   SDValue FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
+                                        ArrayRef<SDValue> Ops,
+                                        const SDNodeFlags Flags = SDNodeFlags());
+ 
+   /// Constant fold a setcc to true or false.
+   SDValue FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond,
+                     const SDLoc &dl);
+ 
+   /// See if the specified operand can be simplified with the knowledge that only
+   /// the bits specified by Mask are used.  If so, return the simpler operand,
+   /// otherwise return a null SDValue.
+   ///
+   /// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
+   /// simplify nodes with multiple uses more aggressively.)
+   SDValue GetDemandedBits(SDValue V, const APInt &Mask);
+ 
+   /// Return true if the sign bit of Op is known to be zero.
+   /// We use this predicate to simplify operations downstream.
+   bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
+ 
+   /// Return true if 'Op & Mask' is known to be zero.  We
+   /// use this predicate to simplify operations downstream.  Op and Mask are
+   /// known to be the same type.
+   bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth = 0)
+     const;
+ 
+   /// Determine which bits of Op are known to be either zero or one and return
+   /// them in Known. For vectors, the known bits are those that are shared by
+   /// every vector element.
+   /// Targets can implement the computeKnownBitsForTargetNode method in the
+   /// TargetLowering class to allow target nodes to be understood.
+   KnownBits computeKnownBits(SDValue Op, unsigned Depth = 0) const;
+ 
+   /// Determine which bits of Op are known to be either zero or one and return
+   /// them in Known. The DemandedElts argument allows us to only collect the
+   /// known bits that are shared by the requested vector elements.
+   /// Targets can implement the computeKnownBitsForTargetNode method in the
+   /// TargetLowering class to allow target nodes to be understood.
+   KnownBits computeKnownBits(SDValue Op, const APInt &DemandedElts,
+                              unsigned Depth = 0) const;
+ 
+   /// \copydoc SelectionDAG::computeKnownBits(SDValue,unsigned)
+   void computeKnownBits(SDValue Op, KnownBits &Known,
+                         unsigned Depth = 0) const {
+     Known = computeKnownBits(Op, Depth);
+   }
+ 
+   /// \copydoc SelectionDAG::computeKnownBits(SDValue,const APInt&,unsigned)
+   void computeKnownBits(SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+                         unsigned Depth = 0) const {
+     Known = computeKnownBits(Op, DemandedElts, Depth);
+   }
+ 
+   /// Used to represent the possible overflow behavior of an operation.
+   /// Never: the operation cannot overflow.
+   /// Always: the operation will always overflow.
+   /// Sometime: the operation may or may not overflow.
+   enum OverflowKind {
+     OFK_Never,
+     OFK_Sometime,
+     OFK_Always,
+   };
+ 
+   /// Determine if the result of the addition of 2 node can overflow.
+   OverflowKind computeOverflowKind(SDValue N0, SDValue N1) const;
+ 
+   /// Test if the given value is known to have exactly one bit set. This differs
+   /// from computeKnownBits in that it doesn't necessarily determine which bit
+   /// is set.
+   bool isKnownToBeAPowerOfTwo(SDValue Val) const;
+ 
+   /// Return the number of times the sign bit of the register is replicated into
+   /// the other bits. We know that at least 1 bit is always equal to the sign
+   /// bit (itself), but other cases can give us information. For example,
+   /// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
+   /// to each other, so we return 3. Targets can implement the
+   /// ComputeNumSignBitsForTarget method in the TargetLowering class to allow
+   /// target nodes to be understood.
+   unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;
+ 
+   /// Return the number of times the sign bit of the register is replicated into
+   /// the other bits. We know that at least 1 bit is always equal to the sign
+   /// bit (itself), but other cases can give us information. For example,
+   /// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
+   /// to each other, so we return 3. The DemandedElts argument allows
+   /// us to only collect the minimum sign bits of the requested vector elements.
+   /// Targets can implement the ComputeNumSignBitsForTarget method in the
+   /// TargetLowering class to allow target nodes to be understood.
+   unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                               unsigned Depth = 0) const;
+ 
+   /// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
+   /// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
+   /// is guaranteed to have the same semantics as an ADD. This handles the
+   /// equivalence:
+   ///     X|Cst == X+Cst iff X&Cst = 0.
+   bool isBaseWithConstantOffset(SDValue Op) const;
+ 
+   /// Test whether the given SDValue is known to never be NaN. If \p SNaN is
+   /// true, returns if \p Op is known to never be a signaling NaN (it may still
+   /// be a qNaN).
+   bool isKnownNeverNaN(SDValue Op, bool SNaN = false, unsigned Depth = 0) const;
+ 
+   /// \returns true if \p Op is known to never be a signaling NaN.
+   bool isKnownNeverSNaN(SDValue Op, unsigned Depth = 0) const {
+     return isKnownNeverNaN(Op, true, Depth);
+   }
+ 
+   /// Test whether the given floating point SDValue is known to never be
+   /// positive or negative zero.
+   bool isKnownNeverZeroFloat(SDValue Op) const;
+ 
+   /// Test whether the given SDValue is known to contain non-zero value(s).
+   bool isKnownNeverZero(SDValue Op) const;
+ 
+   /// Test whether two SDValues are known to compare equal. This
+   /// is true if they are the same value, or if one is negative zero and the
+   /// other positive zero.
+   bool isEqualTo(SDValue A, SDValue B) const;
+ 
+   /// Return true if A and B have no common bits set. As an example, this can
+   /// allow an 'add' to be transformed into an 'or'.
+   bool haveNoCommonBitsSet(SDValue A, SDValue B) const;
+ 
+   /// Test whether \p V has a splatted value for all the demanded elements.
+   ///
+   /// On success \p UndefElts will indicate the elements that have UNDEF
+   /// values instead of the splat value, this is only guaranteed to be correct
+   /// for \p DemandedElts.
+   ///
+   /// NOTE: The function will return true for a demanded splat of UNDEF values.
+   bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts);
+ 
+   /// Test whether \p V has a splatted value.
+   bool isSplatValue(SDValue V, bool AllowUndefs = false);
+ 
+   /// Match a binop + shuffle pyramid that represents a horizontal reduction
+   /// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node /p
+   /// Extract. The reduction must use one of the opcodes listed in /p
+   /// CandidateBinOps and on success /p BinOp will contain the matching opcode.
+   /// Returns the vector that is being reduced on, or SDValue() if a reduction
+   /// was not matched.
+   SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
+                               ArrayRef<ISD::NodeType> CandidateBinOps);
+ 
+   /// Utility function used by legalize and lowering to
+   /// "unroll" a vector operation by splitting out the scalars and operating
+   /// on each element individually.  If the ResNE is 0, fully unroll the vector
+   /// op. If ResNE is less than the width of the vector op, unroll up to ResNE.
+   /// If the  ResNE is greater than the width of the vector op, unroll the
+   /// vector op and fill the end of the resulting vector with UNDEFS.
+   SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);
+ 
+   /// Return true if loads are next to each other and can be
+   /// merged. Check that both are nonvolatile and if LD is loading
+   /// 'Bytes' bytes from a location that is 'Dist' units away from the
+   /// location that the 'Base' load is loading from.
+   bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+                                       unsigned Bytes, int Dist) const;
+ 
+   /// Infer alignment of a load / store address. Return 0 if
+   /// it cannot be inferred.
+   unsigned InferPtrAlignment(SDValue Ptr) const;
+ 
+   /// Compute the VTs needed for the low/hi parts of a type
+   /// which is split (or expanded) into two not necessarily identical pieces.
+   std::pair<EVT, EVT> GetSplitDestVTs(const EVT &VT) const;
+ 
+   /// Split the vector with EXTRACT_SUBVECTOR using the provides
+   /// VTs and return the low/high part.
+   std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
+                                           const EVT &LoVT, const EVT &HiVT);
+ 
+   /// Split the vector with EXTRACT_SUBVECTOR and return the low/high part.
+   std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL) {
+     EVT LoVT, HiVT;
+     std::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType());
+     return SplitVector(N, DL, LoVT, HiVT);
+   }
+ 
+   /// Split the node's operand with EXTRACT_SUBVECTOR and
+   /// return the low/high part.
+   std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
+   {
+     return SplitVector(N->getOperand(OpNo), SDLoc(N));
+   }
+ 
+   /// Append the extracted elements from Start to Count out of the vector Op
+   /// in Args. If Count is 0, all of the elements will be extracted.
+   void ExtractVectorElements(SDValue Op, SmallVectorImpl<SDValue> &Args,
+                              unsigned Start = 0, unsigned Count = 0);
+ 
+   /// Compute the default alignment value for the given type.
+   unsigned getEVTAlignment(EVT MemoryVT) const;
+ 
+   /// Test whether the given value is a constant int or similar node.
+   SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N);
+ 
+   /// Test whether the given value is a constant FP or similar node.
+   SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N);
+ 
+   /// \returns true if \p N is any kind of constant or build_vector of
+   /// constants, int or float. If a vector, it may not necessarily be a splat.
+   inline bool isConstantValueOfAnyType(SDValue N) {
+     return isConstantIntBuildVectorOrConstantInt(N) ||
+            isConstantFPBuildVectorOrConstantFP(N);
+   }
+ 
+ private:
+   void InsertNode(SDNode *N);
+   bool RemoveNodeFromCSEMaps(SDNode *N);
+   void AddModifiedNodeToCSEMaps(SDNode *N);
+   SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
+   SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2,
+                                void *&InsertPos);
+   SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
+                                void *&InsertPos);
+   SDNode *UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &loc);
+ 
+   void DeleteNodeNotInCSEMaps(SDNode *N);
+   void DeallocateNode(SDNode *N);
+ 
+   void allnodes_clear();
+ 
+   /// Look up the node specified by ID in CSEMap.  If it exists, return it.  If
+   /// not, return the insertion token that will make insertion faster.  This
+   /// overload is for nodes other than Constant or ConstantFP, use the other one
+   /// for those.
+   SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos);
+ 
+   /// Look up the node specified by ID in CSEMap.  If it exists, return it.  If
+   /// not, return the insertion token that will make insertion faster.  Performs
+   /// additional processing for constant nodes.
+   SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
+                               void *&InsertPos);
+ 
+   /// List of non-single value types.
+   FoldingSet<SDVTListNode> VTListMap;
+ 
+   /// Maps to auto-CSE operations.
+   std::vector<CondCodeSDNode*> CondCodeNodes;
+ 
+   std::vector<SDNode*> ValueTypeNodes;
+   std::map<EVT, SDNode*, EVT::compareRawBits> ExtendedValueTypeNodes;
+   StringMap<SDNode*> ExternalSymbols;
+ 
+   std::map<std::pair<std::string, unsigned char>,SDNode*> TargetExternalSymbols;
+   DenseMap<MCSymbol *, SDNode *> MCSymbols;
+ };
+ 
+ template <> struct GraphTraits<SelectionDAG*> : public GraphTraits<SDNode*> {
+   using nodes_iterator = pointer_iterator<SelectionDAG::allnodes_iterator>;
+ 
+   static nodes_iterator nodes_begin(SelectionDAG *G) {
+     return nodes_iterator(G->allnodes_begin());
+   }
+ 
+   static nodes_iterator nodes_end(SelectionDAG *G) {
+     return nodes_iterator(G->allnodes_end());
+   }
+ };
+ 
+ template <class TargetMemSDNode>
+ SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
+                                          ArrayRef<SDValue> Ops,
+                                          const SDLoc &dl, EVT MemVT,
+                                          MachineMemOperand *MMO) {
+   /// Compose node ID and try to find an existing node.
+   FoldingSetNodeID ID;
+   unsigned Opcode =
+     TargetMemSDNode(dl.getIROrder(), DebugLoc(), VTs, MemVT, MMO).getOpcode();
+   ID.AddInteger(Opcode);
+   ID.AddPointer(VTs.VTs);
+   for (auto& Op : Ops) {
+     ID.AddPointer(Op.getNode());
+     ID.AddInteger(Op.getResNo());
+   }
+   ID.AddInteger(MemVT.getRawBits());
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   ID.AddInteger(getSyntheticNodeSubclassData<TargetMemSDNode>(
+     dl.getIROrder(), VTs, MemVT, MMO));
+ 
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<TargetMemSDNode>(E)->refineAlignment(MMO);
+     return SDValue(E, 0);
+   }
+ 
+   /// Existing node was not found. Create a new one.
+   auto *N = newSDNode<TargetMemSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+                                        MemVT, MMO);
+   createOperands(N, Ops);
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   return SDValue(N, 0);
+ }
+ 
+ } // end namespace llvm
+ 
+ #endif // LLVM_CODEGEN_SELECTIONDAG_H
+diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+index ceb86898946..f892d9f61d4 100644
+--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
++++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+@@ -1,9196 +1,9218 @@
+ //===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This implements the SelectionDAG class.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #include "llvm/CodeGen/SelectionDAG.h"
+ #include "SDNodeDbgValue.h"
+ #include "llvm/ADT/APFloat.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/ADT/APSInt.h"
+ #include "llvm/ADT/ArrayRef.h"
+ #include "llvm/ADT/BitVector.h"
+ #include "llvm/ADT/FoldingSet.h"
+ #include "llvm/ADT/None.h"
+ #include "llvm/ADT/STLExtras.h"
+ #include "llvm/ADT/SmallPtrSet.h"
+ #include "llvm/ADT/SmallVector.h"
+ #include "llvm/ADT/Triple.h"
+ #include "llvm/ADT/Twine.h"
+ #include "llvm/Analysis/ValueTracking.h"
+ #include "llvm/CodeGen/ISDOpcodes.h"
+ #include "llvm/CodeGen/MachineBasicBlock.h"
+ #include "llvm/CodeGen/MachineConstantPool.h"
+ #include "llvm/CodeGen/MachineFrameInfo.h"
+ #include "llvm/CodeGen/MachineFunction.h"
+ #include "llvm/CodeGen/MachineMemOperand.h"
+ #include "llvm/CodeGen/RuntimeLibcalls.h"
+ #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+ #include "llvm/CodeGen/SelectionDAGNodes.h"
+ #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+ #include "llvm/CodeGen/TargetLowering.h"
+ #include "llvm/CodeGen/TargetRegisterInfo.h"
+ #include "llvm/CodeGen/TargetSubtargetInfo.h"
+ #include "llvm/CodeGen/ValueTypes.h"
+ #include "llvm/IR/Constant.h"
+ #include "llvm/IR/Constants.h"
+ #include "llvm/IR/DataLayout.h"
+ #include "llvm/IR/DebugInfoMetadata.h"
+ #include "llvm/IR/DebugLoc.h"
+ #include "llvm/IR/DerivedTypes.h"
+ #include "llvm/IR/Function.h"
+ #include "llvm/IR/GlobalValue.h"
+ #include "llvm/IR/Metadata.h"
+ #include "llvm/IR/Type.h"
+ #include "llvm/IR/Value.h"
+ #include "llvm/Support/Casting.h"
+ #include "llvm/Support/CodeGen.h"
+ #include "llvm/Support/Compiler.h"
+ #include "llvm/Support/Debug.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/KnownBits.h"
+ #include "llvm/Support/MachineValueType.h"
+ #include "llvm/Support/ManagedStatic.h"
+ #include "llvm/Support/MathExtras.h"
+ #include "llvm/Support/Mutex.h"
+ #include "llvm/Support/raw_ostream.h"
+ #include "llvm/Target/TargetMachine.h"
+ #include "llvm/Target/TargetOptions.h"
+ #include <algorithm>
+ #include <cassert>
+ #include <cstdint>
+ #include <cstdlib>
+ #include <limits>
+ #include <set>
+ #include <string>
+ #include <utility>
+ #include <vector>
+ 
+ using namespace llvm;
+ 
+ /// makeVTList - Return an instance of the SDVTList struct initialized with the
+ /// specified members.
+ static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
+   SDVTList Res = {VTs, NumVTs};
+   return Res;
+ }
+ 
+ // Default null implementations of the callbacks.
+ void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
+ void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
+ 
+ #define DEBUG_TYPE "selectiondag"
+ 
+ static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
+        cl::Hidden, cl::init(true),
+        cl::desc("Gang up loads and stores generated by inlining of memcpy"));
+ 
+ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
+        cl::desc("Number limit for gluing ld/st of memcpy."),
+        cl::Hidden, cl::init(0));
+ 
+ static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
+   LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                              ConstantFPSDNode Class
+ //===----------------------------------------------------------------------===//
+ 
+ /// isExactlyValue - We don't rely on operator== working on double values, as
+ /// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+ /// As such, this method can be used to do an exact bit-for-bit comparison of
+ /// two floating point values.
+ bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+   return getValueAPF().bitwiseIsEqual(V);
+ }
+ 
+ /// Return true if Val converts to the floating-point type VT (rounding to
+ /// nearest, ties to even) without the conversion reporting lost information.
+ bool ConstantFPSDNode::isValueValidForType(EVT VT, const APFloat &Val) {
+   assert(VT.isFloatingPoint() && "Can only convert between FP types");
+ 
+   // convert() modifies its receiver in place, so work on a scratch copy.
+   APFloat Scratch(Val);
+   bool InfoLost;
+   const auto &TargetSem = SelectionDAG::EVTToAPFloatSemantics(VT);
+   (void)Scratch.convert(TargetSem, APFloat::rmNearestTiesToEven, &InfoLost);
+   return !InfoLost;
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                              ISD Namespace
+ //===----------------------------------------------------------------------===//
+ 
+ /// True if N is a BUILD_VECTOR constant splat exactly one element wide.
+ bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
+   auto *BuildVec = dyn_cast<BuildVectorSDNode>(N);
+   if (BuildVec == nullptr)
+     return false;
+   APInt UndefBits;
+   unsigned SplatWidth;
+   bool AnyUndefs;
+   unsigned EltBits = N->getValueType(0).getVectorElementType().getSizeInBits();
+   bool IsSplat = BuildVec->isConstantSplat(SplatVal, UndefBits, SplatWidth,
+                                            AnyUndefs, EltBits);
+   return IsSplat && EltBits == SplatWidth; // SplatVal was passed to the query.
+ }
+ 
+ // FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
+ // specializations of the more general isConstantSplatVector()?
+ 
+ /// Return true if N, looking through any BITCASTs, is a BUILD_VECTOR whose
+ /// defined operands are all one and the same constant with every bit of the
+ /// element width set.  Undef operands are skipped; all-undef is rejected.
+ bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+   // Look through bit converts to the node that builds the vector.
+   for (; N->getOpcode() == ISD::BITCAST; N = N->getOperand(0).getNode())
+     ;
+   if (N->getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+   // Find the first operand that is not undef.
+   const unsigned NumOps = N->getNumOperands();
+   unsigned Idx = 0;
+   while (Idx != NumOps && N->getOperand(Idx).isUndef())
+     ++Idx;
+   if (Idx == NumOps)
+     return false; // Do not accept an all-undef vector.
+ 
+   // That operand must be an integer or FP constant whose low EltBits bits are
+   // all ones.  Only the element width is examined: type legalization may have
+   // promoted the constant to a wider legal type than the vector element (a
+   // vector type can be legal when its element type is not), and we care
+   // whether the resultant vector is all ones, not the individual constants.
+   SDValue First = N->getOperand(Idx);
+   const unsigned EltBits = N->getValueType(0).getScalarSizeInBits();
+   unsigned LowOnes;
+   if (auto *IntC = dyn_cast<ConstantSDNode>(First))
+     LowOnes = IntC->getAPIntValue().countTrailingOnes();
+   else if (auto *FPC = dyn_cast<ConstantFPSDNode>(First))
+     LowOnes = FPC->getValueAPF().bitcastToAPInt().countTrailingOnes();
+   else
+     return false; // Not a constant.
+   if (LowOnes < EltBits)
+     return false;
+ 
+   // Every later operand must be that same SDValue or undef.  A direct
+   // comparison is OK, as the same legalization applied to all the elements.
+   for (unsigned I = Idx + 1; I != NumOps; ++I) {
+     const SDValue &Cur = N->getOperand(I);
+     if (Cur != First && !Cur.isUndef())
+       return false;
+   }
+   return true;
+ }
+ 
+ /// Return true if N, looking through any BITCASTs, is a BUILD_VECTOR in which
+ /// every defined operand is an integer or FP constant whose low element-width
+ /// bits are all zero.  Undef operands are skipped, but a vector consisting of
+ /// undefs only is rejected.
+ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+   // Look through bit converts to the node that builds the vector.
+   for (; N->getOpcode() == ISD::BITCAST; N = N->getOperand(0).getNode())
+     ;
+ 
+   if (N->getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+ 
+   // Only the element width is checked: type legalization may have promoted
+   // the constants beyond the vector element type, and we care whether the
+   // resultant vector is all zeros, not whether the individual constants are.
+   const unsigned EltBits = N->getValueType(0).getScalarSizeInBits();
+   bool SawDefined = false;
+   for (const SDValue &Elt : N->op_values()) {
+     if (Elt.isUndef())
+       continue;
+     SawDefined = true;
+ 
+     unsigned LowZeros;
+     if (auto *IntC = dyn_cast<ConstantSDNode>(Elt))
+       LowZeros = IntC->getAPIntValue().countTrailingZeros();
+     else if (auto *FPC = dyn_cast<ConstantFPSDNode>(Elt))
+       LowZeros = FPC->getValueAPF().bitcastToAPInt().countTrailingZeros();
+     else
+       return false; // Not a constant.
+     if (LowZeros < EltBits)
+       return false;
+   }
+ 
+   // Do not accept an all-undef vector.
+   return SawDefined;
+ }
+ 
+ /// Return true if N is a BUILD_VECTOR whose operands are each either undef or
+ /// a ConstantSDNode.
+ bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
+   if (N->getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+   for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
+     const SDValue &Elt = N->getOperand(I);
+     if (!Elt.isUndef() && !isa<ConstantSDNode>(Elt))
+       return false;
+   }
+   return true;
+ }
+ 
+ /// Return true if N is a BUILD_VECTOR whose operands are each either undef or
+ /// a ConstantFPSDNode.
+ bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
+   if (N->getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+   for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
+     const SDValue &Elt = N->getOperand(I);
+     if (!Elt.isUndef() && !isa<ConstantFPSDNode>(Elt))
+       return false;
+   }
+   return true;
+ }
+ 
+ /// Return true if N has at least one operand and every operand is undef.
+ bool ISD::allOperandsUndef(const SDNode *N) {
+   // A node without operands is deliberately reported as false.  This is
+   // "logically inconsistent" with the definition of "all" but is probably
+   // the desired behavior.
+   const unsigned NumOps = N->getNumOperands();
+   if (NumOps == 0)
+     return false;
+   for (unsigned I = 0; I != NumOps; ++I)
+     if (!N->getOperand(I).isUndef())
+       return false;
+   return true;
+ }
+ 
+ /// Run Match on Op if it is a constant, or on each element of a BUILD_VECTOR.
+ bool ISD::matchUnaryPredicate(SDValue Op,
+                               std::function<bool(ConstantSDNode *)> Match) {
+   if (auto *Scalar = dyn_cast<ConstantSDNode>(Op))
+     return Match(Scalar);
+   if (Op.getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+   // Each element must be a constant of exactly the scalar type that matches.
+   const EVT ScalarVT = Op.getValueType().getScalarType();
+   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+     auto *EltCst = dyn_cast<ConstantSDNode>(Op.getOperand(I));
+     if (!(EltCst && EltCst->getValueType(0) == ScalarVT && Match(EltCst)))
+       return false;
+   }
+   return true;
+ }
+ 
+ /// Apply Match to a pair of same-typed operands: directly if both are scalar
+ /// constants, or element by element if both are BUILD_VECTORs whose elements
+ /// are constants of exactly the vector's scalar type.
+ bool ISD::matchBinaryPredicate(
+     SDValue LHS, SDValue RHS,
+     std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
+   if (LHS.getValueType() != RHS.getValueType())
+     return false;
+ 
+   if (auto *LScalar = dyn_cast<ConstantSDNode>(LHS))
+     if (auto *RScalar = dyn_cast<ConstantSDNode>(RHS))
+       return Match(LScalar, RScalar);
+ 
+   if (LHS.getOpcode() != ISD::BUILD_VECTOR ||
+       RHS.getOpcode() != ISD::BUILD_VECTOR)
+     return false;
+ 
+   const EVT ScalarVT = LHS.getValueType().getScalarType();
+   for (unsigned I = 0, E = LHS.getNumOperands(); I != E; ++I) {
+     auto *L = dyn_cast<ConstantSDNode>(LHS.getOperand(I));
+     auto *R = dyn_cast<ConstantSDNode>(RHS.getOperand(I));
+     // Both elements must be constants sharing the scalar type, and match.
+     if (!L || !R || L->getValueType(0) != ScalarVT ||
+         L->getValueType(0) != R->getValueType(0) || !Match(L, R))
+       return false;
+   }
+   return true;
+ }
+ 
+ /// Map a load extension kind to the corresponding extension opcode.  Any
+ /// kind other than EXTLOAD, SEXTLOAD or ZEXTLOAD reaches llvm_unreachable.
+ ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
+   // Sign- and zero-extending loads map to the matching extend regardless of
+   // IsFP; only an any-extending load distinguishes FP from integer.
+   if (ExtType == ISD::SEXTLOAD)
+     return ISD::SIGN_EXTEND;
+   if (ExtType == ISD::ZEXTLOAD)
+     return ISD::ZERO_EXTEND;
+   if (ExtType == ISD::EXTLOAD)
+     return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
+ 
+   llvm_unreachable("Invalid LoadExtType");
+ }
+ 
+ /// Return the condition code equivalent to Operation with its two operands
+ /// exchanged, obtained by trading the L and G bits of the encoding.
+ ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+   const unsigned LBit = (Operation >> 2) & 1; // Current L bit.
+   const unsigned GBit = (Operation >> 1) & 1; // Current G bit.
+   const unsigned Rest = Operation & ~6;       // N, U and E bits, kept as-is.
+   // The old L bit becomes the new G bit (bit 1) and vice versa (bit 2).
+   return ISD::CondCode(Rest | (LBit << 1) | (GBit << 2));
+ }
+ 
+ /// Return the logical inverse of Op.  Integer comparisons flip only the L, G
+ /// and E bits; all other comparisons flip all four condition bits.
+ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
+   // 7 flips L, G and E while leaving U alone; 15 flips every condition bit.
+   const unsigned FlipMask = isInteger ? 7 : 15;
+   unsigned Inverted = unsigned(Op) ^ FlipMask;
+ 
+   // Don't let N and U bits get set.
+   if (Inverted > ISD::SETTRUE2)
+     Inverted &= ~8;
+   return ISD::CondCode(Inverted);
+ }
+ 
+ /// Classify an integer condition code by signedness:
+ ///   0 - seteq / setne, which do not depend on the sign of the input;
+ ///   1 - the signed orderings (setlt, setle, setgt, setge);
+ ///   2 - the unsigned orderings (setult, setule, setugt, setuge).
+ /// The non-zero results are distinct bits, so two results can be OR'ed together.
+ /// Any other condition code is illegal here and hits llvm_unreachable.
+ static int isSignedOp(ISD::CondCode Opcode) {
+   switch (Opcode) {
+   case ISD::SETEQ: case ISD::SETNE:
+     return 0;
+   case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE:
+     return 1;
+   case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE:
+     return 2;
+   default:
+     llvm_unreachable("Illegal integer setcc operation!");
+   }
+ }
+ 
+ /// Return the condition code equivalent to "Op1 or Op2" on the same operands,
+ /// or SETCC_INVALID when a signed integer code meets an unsigned one.
+ ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                        bool IsInteger) {
+   // isSignedOp gives 1 for signed and 2 for unsigned codes, so 3 means a mix.
+   if (IsInteger) {
+     const int Signedness = isSignedOp(Op1) | isSignedOp(Op2);
+     if (Signedness == 3)
+       return ISD::SETCC_INVALID;
+   }
+   unsigned Merged = Op1 | Op2; // Combine all of the condition bits.
+   // If the N and U bits get set, then the resultant comparison DOES suddenly
+   // care about orderedness, and it is true when ordered.
+   if (Merged > ISD::SETTRUE2)
+     Merged &= ~16; // NOTE(review): was documented as clearing the U bit.
+   if (IsInteger && Merged == ISD::SETUNE) // Illegal, e.g. SETUGT | SETULT.
+     Merged = ISD::SETNE;
+   return ISD::CondCode(Merged);
+ }
+ 
+ /// Return the condition code equivalent to "Op1 and Op2" on the same operands,
+ /// or SETCC_INVALID when a signed integer code meets an unsigned one.
+ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                         bool IsInteger) {
+   if (IsInteger) {
+     const int Signedness = isSignedOp(Op1) | isSignedOp(Op2);
+     if (Signedness == 3) // One signed (1) and one unsigned (2) comparison.
+       return ISD::SETCC_INVALID;
+   }
+   // AND-ing the encodings keeps only the condition bits common to both.
+   const ISD::CondCode Common = ISD::CondCode(Op1 & Op2);
+   if (!IsInteger)
+     return Common;
+   // Canonicalize illegal integer setcc's.
+   switch (Common) {
+   case ISD::SETUO:  return ISD::SETFALSE; // SETUGT & SETULT
+   case ISD::SETOEQ:                       // SETEQ  & SETU[LG]E
+   case ISD::SETUEQ: return ISD::SETEQ;    // SETUGE & SETULE
+   case ISD::SETOLT: return ISD::SETULT;   // SETULT & SETNE
+   case ISD::SETOGT: return ISD::SETUGT;   // SETUGT & SETNE
+   default:          return Common;
+   }
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                           SDNode Profile Support
+ //===----------------------------------------------------------------------===//
+ 
+ /// AddNodeIDOpcode - Add the node opcode to the NodeID data as an integer.
+ static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC)  {
+   ID.AddInteger(OpC);
+ }
+ 
+ /// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+ /// solely with their pointer; only VTList.VTs is added to the NodeID data.
+ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+   ID.AddPointer(VTList.VTs);
+ }
+ 
+ /// AddNodeIDOperands - Append each SDValue operand to the NodeID data as the
+ /// pair (node pointer, result number), in operand order.
+ static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDValue> Ops) {
+   for (const SDValue &Operand : Ops) {
+     ID.AddPointer(Operand.getNode());
+     ID.AddInteger(Operand.getResNo());
+   }
+ }
+ 
+ /// AddNodeIDOperands - Append each SDUse operand to the NodeID data as the
+ /// pair (node pointer, result number), in operand order.
+ static void AddNodeIDOperands(FoldingSetNodeID &ID, ArrayRef<SDUse> Ops) {
+   for (const SDUse &Operand : Ops) {
+     ID.AddPointer(Operand.getNode());
+     ID.AddInteger(Operand.getResNo());
+   }
+ }
+ 
+ static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
+                           SDVTList VTList, ArrayRef<SDValue> OpList) {
+   AddNodeIDOpcode(ID, OpC);        // 1. The opcode.
+   AddNodeIDValueTypes(ID, VTList); // 2. The result value type list.
+   AddNodeIDOperands(ID, OpList);   // 3. The operands, in order.
+ }
+ 
+ /// If N's opcode carries special info beyond its operands, add it to ID.
+ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+   switch (N->getOpcode()) {
+   case ISD::TargetExternalSymbol:
+   case ISD::ExternalSymbol:
+   case ISD::MCSymbol:
+     llvm_unreachable("Should only be used on nodes with operands");
+   default: break;  // Normal nodes don't need extra info.
+   case ISD::TargetConstant:
+   case ISD::Constant: { // Constant value pointer plus the opaque flag.
+     const ConstantSDNode *C = cast<ConstantSDNode>(N);
+     ID.AddPointer(C->getConstantIntValue());
+     ID.AddBoolean(C->isOpaque());
+     break;
+   }
+   case ISD::TargetConstantFP:
+   case ISD::ConstantFP: // Constant FP value pointer.
+     ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+     break;
+   case ISD::TargetGlobalAddress:
+   case ISD::GlobalAddress:
+   case ISD::TargetGlobalTLSAddress:
+   case ISD::GlobalTLSAddress: { // Global, offset and target flags.
+     const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+     ID.AddPointer(GA->getGlobal());
+     ID.AddInteger(GA->getOffset());
+     ID.AddInteger(GA->getTargetFlags());
+     break;
+   }
+   case ISD::BasicBlock:
+     ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+     break;
+   case ISD::Register:
+     ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+     break;
+   case ISD::RegisterMask:
+     ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
+     break;
+   case ISD::SRCVALUE:
+     ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+     break;
+   case ISD::FrameIndex:
+   case ISD::TargetFrameIndex:
+     ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+     break;
+   case ISD::JumpTable:
+   case ISD::TargetJumpTable: // Table index and target flags.
+     ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+     ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
+     break;
+   case ISD::ConstantPool:
+   case ISD::TargetConstantPool: { // Alignment, offset, value, target flags.
+     const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+     ID.AddInteger(CP->getAlignment());
+     ID.AddInteger(CP->getOffset());
+     if (CP->isMachineConstantPoolEntry())
+       CP->getMachineCPVal()->addSelectionDAGCSEId(ID); // Value adds its own id.
+     else
+       ID.AddPointer(CP->getConstVal());
+     ID.AddInteger(CP->getTargetFlags());
+     break;
+   }
+   case ISD::TargetIndex: { // Index, offset and target flags.
+     const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
+     ID.AddInteger(TI->getIndex());
+     ID.AddInteger(TI->getOffset());
+     ID.AddInteger(TI->getTargetFlags());
+     break;
+   }
+   case ISD::LOAD: { // Memory VT bits, raw subclass data, address space.
+     const LoadSDNode *LD = cast<LoadSDNode>(N);
+     ID.AddInteger(LD->getMemoryVT().getRawBits());
+     ID.AddInteger(LD->getRawSubclassData());
+     ID.AddInteger(LD->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::STORE: { // Same three fields as LOAD.
+     const StoreSDNode *ST = cast<StoreSDNode>(N);
+     ID.AddInteger(ST->getMemoryVT().getRawBits());
+     ID.AddInteger(ST->getRawSubclassData());
+     ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::MLOAD: { // Same three fields as LOAD.
+     const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
+     ID.AddInteger(MLD->getMemoryVT().getRawBits());
+     ID.AddInteger(MLD->getRawSubclassData());
+     ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::MSTORE: { // Same three fields as LOAD.
+     const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+     ID.AddInteger(MST->getMemoryVT().getRawBits());
+     ID.AddInteger(MST->getRawSubclassData());
+     ID.AddInteger(MST->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::MGATHER: { // Same three fields as LOAD.
+     const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
+     ID.AddInteger(MG->getMemoryVT().getRawBits());
+     ID.AddInteger(MG->getRawSubclassData());
+     ID.AddInteger(MG->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::MSCATTER: { // Same three fields as LOAD.
+     const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
+     ID.AddInteger(MS->getMemoryVT().getRawBits());
+     ID.AddInteger(MS->getRawSubclassData());
+     ID.AddInteger(MS->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::ATOMIC_CMP_SWAP:
+   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+   case ISD::ATOMIC_SWAP:
+   case ISD::ATOMIC_LOAD_ADD:
+   case ISD::ATOMIC_LOAD_SUB:
+   case ISD::ATOMIC_LOAD_AND:
+   case ISD::ATOMIC_LOAD_CLR:
+   case ISD::ATOMIC_LOAD_OR:
+   case ISD::ATOMIC_LOAD_XOR:
+   case ISD::ATOMIC_LOAD_NAND:
+   case ISD::ATOMIC_LOAD_MIN:
+   case ISD::ATOMIC_LOAD_MAX:
+   case ISD::ATOMIC_LOAD_UMIN:
+   case ISD::ATOMIC_LOAD_UMAX:
+   case ISD::ATOMIC_LOAD:
+   case ISD::ATOMIC_STORE: { // All atomics: same three fields as LOAD.
+     const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+     ID.AddInteger(AT->getMemoryVT().getRawBits());
+     ID.AddInteger(AT->getRawSubclassData());
+     ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::PREFETCH: { // Only the address space is added.
+     const MemSDNode *PF = cast<MemSDNode>(N);
+     ID.AddInteger(PF->getPointerInfo().getAddrSpace());
+     break;
+   }
+   case ISD::VECTOR_SHUFFLE: { // One mask element per result vector element.
+     const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+     for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+          i != e; ++i)
+       ID.AddInteger(SVN->getMaskElt(i));
+     break;
+   }
+   case ISD::TargetBlockAddress:
+   case ISD::BlockAddress: { // Block address, offset and target flags.
+     const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+     ID.AddPointer(BA->getBlockAddress());
+     ID.AddInteger(BA->getOffset());
+     ID.AddInteger(BA->getTargetFlags());
+     break;
+   }
+   } // end switch (N->getOpcode())
+ 
+   // Target specific memory nodes also contribute their address space.
+   if (N->isTargetMemoryOpcode())
+     ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
+ }
+ 
+ /// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
+ /// data: opcode, value types, operands, then any opcode-specific extras.
+ static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+   AddNodeIDOpcode(ID, N->getOpcode());
+   // Add the return value info.
+   AddNodeIDValueTypes(ID, N->getVTList());
+   // Add the operand info.
+   AddNodeIDOperands(ID, N->ops());
+ 
+   // Handle SDNodes that carry special info (see AddNodeIDCustom).
+   AddNodeIDCustom(ID, N);
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                              SelectionDAG Class
+ //===----------------------------------------------------------------------===//
+ 
+ /// doNotCSE - Return true if CSE should not be performed for this node: that
+ /// is, for HANDLENODE and EH_LABEL nodes and for any node producing a glue
+ /// (flag) result.
+ static bool doNotCSE(SDNode *N) {
+   // Result 0 is inspected unconditionally, before anything else.
+   if (N->getValueType(0) == MVT::Glue)
+     return true;
+ 
+   const unsigned Opc = N->getOpcode();
+   if (Opc == ISD::HANDLENODE || Opc == ISD::EH_LABEL)
+     return true; // Never CSE these nodes.
+ 
+   // Check that remaining values produced are not flags.
+   for (unsigned I = 1, E = N->getNumValues(); I != E; ++I)
+     if (N->getValueType(I) == MVT::Glue)
+       return true;
+ 
+   return false;
+ }
+ 
+ /// RemoveDeadNodes - Delete every node in the SelectionDAG that has no uses,
+ /// together with any nodes that become dead as a consequence.  The root is
+ /// re-read afterwards in case it changed.
+ void SelectionDAG::RemoveDeadNodes() {
+   // A dummy handle node (which is not added to allnodes) holds a reference to
+   // the root node, preventing it from being deleted.
+   HandleSDNode RootHandle(getRoot());
+ 
+   // Seed the worklist with all obviously-dead nodes.
+   SmallVector<SDNode*, 128> Worklist;
+   for (SDNode &Candidate : allnodes())
+     if (Candidate.use_empty())
+       Worklist.push_back(&Candidate);
+ 
+   RemoveDeadNodes(Worklist);
+ 
+   // If the root changed (e.g. it was a dead load), update the root.
+   setRoot(RootHandle.getValue());
+ }
+ 
+ /// RemoveDeadNodes - Delete the nodes on the given worklist, together with any
+ /// operand nodes that are left without uses as a result.  Listeners are told
+ /// about each deletion.
+ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
+   // Drain the worklist; operands that lose their last use are appended to it.
+   while (!DeadNodes.empty()) {
+     SDNode *Dead = DeadNodes.pop_back_val();
+     // Skip to the next node if this one has already been deleted.  This could
+     // happen if replacing a node causes a node previously added to the
+     // worklist to be deleted.
+     if (Dead->getOpcode() == ISD::DELETED_NODE)
+       continue;
+ 
+     // Tell every registered listener that the node is going away.
+     for (DAGUpdateListener *L = UpdateListeners; L; L = L->Next)
+       L->NodeDeleted(Dead, nullptr);
+ 
+     // Take the node out of the appropriate CSE map.
+     RemoveNodeFromCSEMaps(Dead);
+ 
+     // Next, brutally remove the operand list.  This is safe to do, as there
+     // are no cycles in the graph.
+     for (SDNode::op_iterator I = Dead->op_begin(), E = Dead->op_end(); I != E;
+          ++I) {
+       SDNode *Operand = I->getNode();
+       I->set(SDValue());
+ 
+       // Now that the edge is gone, queue the operand if it has no uses left.
+       if (Operand->use_empty())
+         DeadNodes.push_back(Operand);
+     }
+ 
+     DeallocateNode(Dead);
+   }
+ }
+ 
+ /// Delete N by running the worklist-based RemoveDeadNodes on it alone.
+ void SelectionDAG::RemoveDeadNode(SDNode *N){
+   // Hold a reference to the root through a dummy handle node so it is not
+   // deleted.  (This matters if the root is an operand of the dead node.)
+   HandleSDNode RootHandle(getRoot());
+ 
+   SmallVector<SDNode*, 16> Worklist;
+   Worklist.push_back(N);
+   RemoveDeadNodes(Worklist);
+ }
+ 
+ void SelectionDAG::DeleteNode(SDNode *N) {
+   // First take N out of the appropriate CSE map.
+   RemoveNodeFromCSEMaps(N);
+ 
+   // Then hand over to DeleteNodeNotInCSEMaps, which drops N's operands (and
+   // with them the uses they represent) and deallocates the node.
+   DeleteNodeNotInCSEMaps(N);
+ }
+ 
+ void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+   assert(N->getIterator() != AllNodes.begin() &&
+          "Cannot delete the entry node!");
+   assert(N->use_empty() && "Cannot delete a node that is not dead!");
+ 
+   // Drop all of the operands and decrement the used nodes' use counts.
+   N->DropOperands();
+ 
+   DeallocateNode(N); // Unlinks N from AllNodes and releases it.
+ }
+ 
+ void SDDbgInfo::erase(const SDNode *Node) {
+   auto Entry = DbgValMap.find(Node);
+   if (Entry != DbgValMap.end()) { // Node has debug values recorded.
+     for (auto &DbgVal : Entry->second)
+       DbgVal->setIsInvalidated(); // Mark each one invalid...
+     DbgValMap.erase(Entry);       // ...then drop the map entry.
+   }
+ }
+ 
+ void SelectionDAG::DeallocateNode(SDNode *N) {
+   // If we have operands, deallocate them.
+   removeOperands(N);
+ 
+   NodeAllocator.Deallocate(AllNodes.remove(N)); // Unlink N, then release it.
+ 
+   // Set the opcode to DELETED_NODE to help catch bugs when node memory is
+   // reallocated; the field is unpoisoned first since N was just deallocated.
+   // FIXME: There are places in SDag that have grown a dependency on the opcode
+   // value in the released node.
+   __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
+   N->NodeType = ISD::DELETED_NODE;
+ 
+   // If any of the SDDbgValue nodes refer to this SDNode, invalidate
+   // them and forget about that node.
+   DbgInfo->erase(N);
+ }
+ 
+ #ifndef NDEBUG
+ /// VerifySDNode - Assert structural invariants of BUILD_PAIR and BUILD_VECTOR.
+ static void VerifySDNode(SDNode *N) {
+   switch (N->getOpcode()) {
+   default: // No checks for any other opcode.
+     break;
+   case ISD::BUILD_PAIR: { // Two same-typed scalars forming one double-width.
+     EVT VT = N->getValueType(0);
+     assert(N->getNumValues() == 1 && "Too many results!");
+     assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+            "Wrong return type!");
+     assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+     assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+            "Mismatched operand types!");
+     assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+            "Wrong operand type!");
+     assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+            "Wrong return type size");
+     break;
+   }
+   case ISD::BUILD_VECTOR: { // One operand per element of the result vector.
+     assert(N->getNumValues() == 1 && "Too many results!");
+     assert(N->getValueType(0).isVector() && "Wrong return type!");
+     assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+            "Wrong number of operands!");
+     EVT EltVT = N->getValueType(0).getVectorElementType();
+     for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
+       assert((I->getValueType() == EltVT ||
+              (EltVT.isInteger() && I->getValueType().isInteger() &&
+               EltVT.bitsLE(I->getValueType()))) &&
+             "Wrong operand type!");
+       assert(I->getValueType() == N->getOperand(0).getValueType() &&
+              "Operands must all have the same type");
+     }
+     break;
+   }
+   }
+ }
+ #endif // NDEBUG
+ 
+ /// Insert a newly allocated node into the DAG.
+ ///
+ /// Appends N to the AllNodes list.  When NDEBUG is not defined it also gives
+ /// N the next persistent id and runs VerifySDNode on it.
+ void SelectionDAG::InsertNode(SDNode *N) {
+   AllNodes.push_back(N);
+ #ifndef NDEBUG
+   N->PersistentId = NextPersistentId++;
+   VerifySDNode(N);
+ #endif
+ }
+ 
+ /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
+ /// corresponds to it.  This is useful when we're about to delete or repurpose
+ /// the node.  We don't want future requests for structurally identical nodes
+ /// to return N anymore.  Returns true if an entry was actually erased.
+ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+   bool Erased = false;
+   switch (N->getOpcode()) {
+   case ISD::HANDLENODE: return false;  // noop.
+   case ISD::CONDCODE: // Clear the slot indexed by the condition code.
+     assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+            "Cond code doesn't exist!");
+     Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
+     CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
+     break;
+   case ISD::ExternalSymbol: // Keyed by symbol.
+     Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+     break;
+   case ISD::TargetExternalSymbol: { // Keyed by (symbol, target flags).
+     ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
+     Erased = TargetExternalSymbols.erase(
+                std::pair<std::string,unsigned char>(ESN->getSymbol(),
+                                                     ESN->getTargetFlags()));
+     break;
+   }
+   case ISD::MCSymbol: { // Keyed by the MCSymbol.
+     auto *MCSN = cast<MCSymbolSDNode>(N);
+     Erased = MCSymbols.erase(MCSN->getMCSymbol());
+     break;
+   }
+   case ISD::VALUETYPE: { // Extended VTs are erased by key, simple VTs by index.
+     EVT VT = cast<VTSDNode>(N)->getVT();
+     if (VT.isExtended()) {
+       Erased = ExtendedValueTypeNodes.erase(VT);
+     } else {
+       Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
+       ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
+     }
+     break;
+   }
+   default:
+     // Everything else is looked up in the general CSE map.
+     assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
+     assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
+     Erased = CSEMap.RemoveNode(N);
+     break;
+   }
+ #ifndef NDEBUG
+   // Verify that the node was actually in one of the CSE maps, unless it has a
+   // flag result (which cannot be CSE'd), is a machine opcode, or is one of the
+   // special cases that are not subject to CSE.
+   if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
+       !N->isMachineOpcode() && !doNotCSE(N)) {
+     N->dump(this);
+     dbgs() << "\n";
+     llvm_unreachable("Node is not in map!");
+   }
+ #endif
+   return Erased;
+ }
+ 
+ /// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+ /// maps and modified in place. Put it back, unless an identical node already
+ /// exists, in which case all of N's users are transferred to the existing node
+ /// and N is deleted. This transfer can potentially trigger recursive merging.
+ void
+ SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
+   // Node types that aren't CSE'd are handled as if no identical node existed,
+   // which is expressed here by using N itself as the "existing" node.
+   SDNode *Twin = doNotCSE(N) ? N : CSEMap.GetOrInsertNode(N);
+ 
+   if (Twin == N) {
+     // No duplicate was found, so N was simply updated.  Inform listeners.
+     for (DAGUpdateListener *L = UpdateListeners; L; L = L->Next)
+       L->NodeUpdated(N);
+     return;
+   }
+ 
+   // There was already an existing matching node: use ReplaceAllUsesWith to
+   // replace the dead one with the existing one.  This can cause recursive
+   // merging of other unrelated nodes down the line.
+   ReplaceAllUsesWith(N, Twin);
+ 
+   // N is now dead. Inform the listeners and delete it.
+   for (DAGUpdateListener *L = UpdateListeners; L; L = L->Next)
+     L->NodeDeleted(N, Twin);
+ 
+   DeleteNodeNotInCSEMaps(N);
+ }
+ 
+ /// FindModifiedNodeSlot - Single-operand form.  Find the slot N would take if
+ /// its operand were replaced by Op.  Returns null if N is never memoized;
+ /// otherwise the lookup result is returned, which is non-null when a node
+ /// with this operand already exists (its flags are then intersected with N's).
+ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+                                            void *&InsertPos) {
+   if (doNotCSE(N))
+     return nullptr; // N is never memoized, so it has no slot.
+ 
+   FoldingSetNodeID Profile;
+   SDValue NewOps[] = { Op };
+   AddNodeIDNode(Profile, N->getOpcode(), N->getVTList(), NewOps);
+   AddNodeIDCustom(Profile, N);
+   SDNode *Existing = FindNodeOrInsertPos(Profile, SDLoc(N), InsertPos);
+   if (Existing)
+     Existing->intersectFlagsWith(N->getFlags());
+   return Existing;
+ }
+ 
+ /// FindModifiedNodeSlot - Two-operand form.  Find the slot N would take if its
+ /// operands were replaced by Op1 and Op2.  Returns null if N is never
+ /// memoized; otherwise the lookup result is returned, which is non-null when a
+ /// node with these operands already exists (its flags are intersected with N's).
+ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+                                            SDValue Op1, SDValue Op2,
+                                            void *&InsertPos) {
+   if (doNotCSE(N))
+     return nullptr; // N is never memoized, so it has no slot.
+ 
+   FoldingSetNodeID Profile;
+   SDValue NewOps[] = { Op1, Op2 };
+   AddNodeIDNode(Profile, N->getOpcode(), N->getVTList(), NewOps);
+   AddNodeIDCustom(Profile, N);
+   SDNode *Existing = FindNodeOrInsertPos(Profile, SDLoc(N), InsertPos);
+   if (Existing)
+     Existing->intersectFlagsWith(N->getFlags());
+   return Existing;
+ }
+ 
+ /// FindModifiedNodeSlot - Operand-list form.  Find the slot N would take if
+ /// its operands were replaced by Ops.  Returns null if N is never memoized;
+ /// otherwise the lookup result is returned, which is non-null when a node
+ /// with these operands already exists (its flags are intersected with N's).
+ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
+                                            void *&InsertPos) {
+   if (doNotCSE(N))
+     return nullptr; // N is never memoized, so it has no slot.
+ 
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, N->getOpcode(), N->getVTList(), Ops);
+   AddNodeIDCustom(Profile, N);
+   SDNode *Existing = FindNodeOrInsertPos(Profile, SDLoc(N), InsertPos);
+   if (Existing)
+     Existing->intersectFlagsWith(N->getFlags());
+   return Existing;
+ }
+ 
+ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
+   const DataLayout &Layout = getDataLayout();
+   if (VT == MVT::iPTR) // iPTR is represented by an i8 pointer type.
+     return Layout.getABITypeAlignment(
+         PointerType::get(Type::getInt8Ty(*getContext()), 0));
+   return Layout.getABITypeAlignment(VT.getTypeForEVT(*getContext()));
+ }
+ 
+ // EntryNode could meaningfully have debug info if we can find it...
+ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
+     : TM(tm), OptLevel(OL),
+       EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+       Root(getEntryNode()) { // The root starts out as the entry node.
+   InsertNode(&EntryNode);    // Register the member entry node in AllNodes.
+   DbgInfo = new SDDbgInfo(); // Released again in the destructor.
+ }
+ 
+ void SelectionDAG::init(MachineFunction &NewMF,
+                         OptimizationRemarkEmitter &NewORE,
+                         Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
+                         LegacyDivergenceAnalysis * Divergence) {
+   MF = &NewMF; // Assigned first; Context below is derived through MF.
+   SDAGISelPass = PassPtr;
+   ORE = &NewORE;
+   TLI = getSubtarget().getTargetLowering();   // Cached from the subtarget.
+   TSI = getSubtarget().getSelectionDAGInfo(); // Cached from the subtarget.
+   LibInfo = LibraryInfo;
+   Context = &MF->getFunction().getContext(); // Context of MF's IR function.
+   DA = Divergence;
+ }
+ 
+ /// Tear the DAG down: drop every node, clear the operand recycler against
+ /// its allocator, and free the debug-info container.  No DAGUpdateListener
+ /// may still be registered at this point.
+ SelectionDAG::~SelectionDAG() {
+   assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
+ 
+   allnodes_clear();
+   OperandRecycler.clear(OperandAllocator);
+ 
+   delete DbgInfo;
+ }
+ 
+ /// Empty the node list.  The entry node, which must be the first element, is
+ /// only unlinked; every remaining node is deallocated.  Builds with
+ /// assertions enabled also reset the persistent-id counter.
+ void SelectionDAG::allnodes_clear() {
+   assert(&*AllNodes.begin() == &EntryNode);
+ 
+   // EntryNode is a member of the DAG object: detach it, do not free it.
+   AllNodes.remove(AllNodes.begin());
+ 
+   while (!AllNodes.empty()) {
+     SDNode &Front = AllNodes.front();
+     DeallocateNode(&Front);
+   }
+ 
+ #ifndef NDEBUG
+   NextPersistentId = 0;
+ #endif
+ }
+ 
+ /// Query the CSE map for a node matching \p ID; \p InsertPos is handed to the
+ /// map's lookup.  This overload carries no debug location and therefore must
+ /// not be used to look up Constant or ConstantFP nodes.
+ SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+                                           void *&InsertPos) {
+   SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+   if (!Found)
+     return nullptr;
+ 
+   const unsigned Opc = Found->getOpcode();
+   if (Opc == ISD::Constant || Opc == ISD::ConstantFP)
+     llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
+                      "debug location.  Use another overload.");
+ 
+   return Found;
+ }
+ 
+ /// Query the CSE map for a node matching \p ID and, on a hit, reconcile the
+ /// node's debug location with \p DL; \p InsertPos is handed to the map's
+ /// lookup.
+ SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+                                           const SDLoc &DL, void *&InsertPos) {
+   SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+   if (!Found)
+     return nullptr;
+ 
+   const unsigned Opc = Found->getOpcode();
+   if (Opc == ISD::Constant || Opc == ISD::ConstantFP) {
+     // A constant shared between several places gets its debug location
+     // erased.  Propagating one location to all uses would cause a worse
+     // single stepping debugging experience.
+     if (Found->getDebugLoc() != DL.getDebugLoc())
+       Found->setDebugLoc(DebugLoc());
+     return Found;
+   }
+ 
+   // For any other node: if this point of use comes earlier in the
+   // instruction sequence than the recorded one, adopt the earlier debug
+   // location.  An IR order of zero never triggers the update.
+   const unsigned NewOrder = DL.getIROrder();
+   if (NewOrder && NewOrder < Found->getIROrder())
+     Found->setDebugLoc(DL.getDebugLoc());
+ 
+   return Found;
+ }
+ 
+ /// Reset the DAG to the state of holding only the entry node: all nodes,
+ /// operand storage, the CSE map and the side tables are emptied, the entry
+ /// node is re-registered as root, and the debug info is cleared.
+ void SelectionDAG::clear() {
+   // Release nodes and operand storage, then forget all CSE entries.
+   allnodes_clear();
+   OperandRecycler.clear(OperandAllocator);
+   OperandAllocator.Reset();
+   CSEMap.clear();
+ 
+   // Map-like side tables are simply emptied.
+   ExtendedValueTypeNodes.clear();
+   ExternalSymbols.clear();
+   TargetExternalSymbols.clear();
+   MCSymbols.clear();
+ 
+   // Index-addressed tables keep their size; every slot is nulled.
+   for (auto *&Slot : CondCodeNodes)
+     Slot = nullptr;
+   for (auto *&Slot : ValueTypeNodes)
+     Slot = nullptr;
+ 
+   // Start over with a use-free entry node as the root.
+   EntryNode.UseList = nullptr;
+   InsertNode(&EntryNode);
+   Root = getEntryNode();
+ 
+   DbgInfo->clear();
+ }
+ 
+ /// Convert \p Op to \p VT with FP_EXTEND when \p VT is larger than the
+ /// operand type, and with FP_ROUND (second operand: a zero pointer-sized
+ /// constant) otherwise.
+ SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
+   if (VT.bitsGT(Op.getValueType()))
+     return getNode(ISD::FP_EXTEND, DL, VT, Op);
+ 
+   SDValue Zero = getIntPtrConstant(0, DL);
+   return getNode(ISD::FP_ROUND, DL, VT, Op, Zero);
+ }
+ 
+ /// Convert \p Op to \p VT: ANY_EXTEND when \p VT is larger than the operand
+ /// type, TRUNCATE otherwise.
+ SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+   const unsigned Opcode =
+       VT.bitsGT(Op.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+   return getNode(Opcode, DL, VT, Op);
+ }
+ 
+ /// Convert \p Op to \p VT: SIGN_EXTEND when \p VT is larger than the operand
+ /// type, TRUNCATE otherwise.
+ SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+   const unsigned Opcode =
+       VT.bitsGT(Op.getValueType()) ? ISD::SIGN_EXTEND : ISD::TRUNCATE;
+   return getNode(Opcode, DL, VT, Op);
+ }
+ 
+ /// Convert \p Op to \p VT: ZERO_EXTEND when \p VT is larger than the operand
+ /// type, TRUNCATE otherwise.
+ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+   const unsigned Opcode =
+       VT.bitsGT(Op.getValueType()) ? ISD::ZERO_EXTEND : ISD::TRUNCATE;
+   return getNode(Opcode, DL, VT, Op);
+ }
+ 
+ /// Convert the boolean \p Op to \p VT.  If \p VT is not larger than the
+ /// operand type a TRUNCATE is emitted; otherwise the extension opcode is the
+ /// one the target reports for the boolean contents of \p OpVT.
+ SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
+                                         EVT OpVT) {
+   unsigned Opcode = ISD::TRUNCATE;
+ 
+   const bool NeedsExtend = !VT.bitsLE(Op.getValueType());
+   if (NeedsExtend) {
+     TargetLowering::BooleanContent Contents = TLI->getBooleanContents(OpVT);
+     Opcode = TLI->getExtendForContent(Contents);
+   }
+ 
+   return getNode(Opcode, SL, VT, Op);
+ }
+ 
+ /// Zero every bit of \p Op above the low \p VT.getSizeInBits() bits of each
+ /// scalar by AND-ing with a low-bits mask.  \p VT must be a scalar type; when
+ /// it already equals the operand's scalar type, \p Op is returned untouched.
+ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
+   assert(!VT.isVector() &&
+          "getZeroExtendInReg should use the vector element type instead of "
+          "the vector type!");
+ 
+   const EVT OpVT = Op.getValueType();
+   if (OpVT.getScalarType() == VT)
+     return Op;
+ 
+   APInt LowMask =
+       APInt::getLowBitsSet(Op.getScalarValueSizeInBits(), VT.getSizeInBits());
+   SDValue MaskNode = getConstant(LowMask, DL, OpVT);
+   return getNode(ISD::AND, DL, OpVT, Op, MaskNode);
+ }
+ 
+ /// Build a bitwise NOT of \p Val, expressed as (XOR Val, all-ones) where the
+ /// all-ones constant has the scalar width of \p VT.
+ SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+   const unsigned EltBits = VT.getScalarType().getSizeInBits();
+   SDValue AllOnes = getConstant(APInt::getAllOnesValue(EltBits), DL, VT);
+   return getNode(ISD::XOR, DL, VT, Val, AllOnes);
+ }
+ 
+ /// Build a logical NOT of \p Val, expressed as (XOR Val, true) where "true"
+ /// is the boolean constant for \p VT.
+ SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+   return getNode(ISD::XOR, DL, VT, Val, getBoolConstant(true, DL, VT, VT));
+ }
+ 
+ /// Return the constant of type \p VT representing the boolean \p V.  False
+ /// is always 0.  True is 1 or all-ones, depending on the boolean contents the
+ /// target reports for \p OpVT (undefined contents use 1).
+ SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT,
+                                       EVT OpVT) {
+   if (V) {
+     switch (TLI->getBooleanContents(OpVT)) {
+     case TargetLowering::ZeroOrNegativeOneBooleanContent:
+       return getAllOnesConstant(DL, VT);
+     case TargetLowering::ZeroOrOneBooleanContent:
+     case TargetLowering::UndefinedBooleanContent:
+       return getConstant(1, DL, VT);
+     }
+     llvm_unreachable("Unexpected boolean content enum!");
+   }
+ 
+   return getConstant(0, DL, VT);
+ }
+ 
+ /// Create an integer constant of type \p VT from a 64-bit value.  For scalar
+ /// widths below 64 bits the value must fit, either as an unsigned value or
+ /// as a sign-extended one; this is asserted.
+ SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+                                   bool isT, bool isO) {
+   const unsigned EltBits = VT.getScalarType().getSizeInBits();
+   assert((EltBits >= 64 ||
+          (uint64_t)((int64_t)Val >> EltBits) + 1 < 2) &&
+          "getConstant with a uint64_t value that doesn't fit in the type!");
+   return getConstant(APInt(EltBits, Val), DL, VT, isT, isO);
+ }
+ 
+ /// Create an integer constant of type \p VT from an APInt by first uniquing
+ /// the value as a ConstantInt in the DAG's LLVMContext.
+ SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+                                   bool isT, bool isO) {
+   ConstantInt *Uniqued = ConstantInt::get(*Context, Val);
+   return getConstant(*Uniqued, DL, VT, isT, isO);
+ }
+ 
+ /// Create (or reuse) an integer constant node of type \p VT holding \p Val.
+ /// For a vector \p VT the result is a splat of the scalar constant.  \p isT
+ /// selects ISD::TargetConstant instead of ISD::Constant; \p isO is stored in
+ /// the node and is part of its CSE identity.  Vector element types that the
+ /// target promotes or (once legal types are required) expands are handled
+ /// here so the returned value uses a suitable element type.
+ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
+                                   EVT VT, bool isT, bool isO) {
+   assert(VT.isInteger() && "Cannot create FP integer constant!");
+ 
+   EVT EltVT = VT.getScalarType();
+   const ConstantInt *Elt = &Val;
+ 
+   // In some cases the vector type is legal but the element type is illegal and
+   // needs to be promoted, for example v8i8 on ARM.  In this case, promote the
+   // inserted value (the type does not need to match the vector element type).
+   // Any extra bits introduced will be truncated away.
+   if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
+       TargetLowering::TypePromoteInteger) {
+    EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+    APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
+    Elt = ConstantInt::get(*getContext(), NewVal);
+   }
+   // In other cases the element type is illegal and needs to be expanded, for
+   // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+   // the value into n parts and use a vector type with n-times the elements.
+   // Then bitcast to the type requested.
+   // Legalizing constants too early makes the DAGCombiner's job harder so we
+   // only legalize if the DAG tells us we must produce legal types.
+   else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+            TLI->getTypeAction(*getContext(), EltVT) ==
+            TargetLowering::TypeExpandInteger) {
+     const APInt &NewVal = Elt->getValue();
+     EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+     unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+     unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+     EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+ 
+     // Check the temporary vector is the correct size. If this fails then
+     // getTypeToTransformTo() probably returned a type whose size (in bits)
+     // isn't a power-of-2 factor of the requested type size.
+     assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+ 
+     // Slice the value into ViaEltSizeInBits-wide constants, lowest bits
+     // first; one original element yields ViaVecNumElts / NumElements parts.
+     SmallVector<SDValue, 2> EltParts;
+     for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+       EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+                                            .zextOrTrunc(ViaEltSizeInBits), DL,
+                                      ViaEltVT, isT, isO));
+     }
+ 
+     // EltParts is currently in little endian order. If we actually want
+     // big-endian order then reverse it now.
+     if (getDataLayout().isBigEndian())
+       std::reverse(EltParts.begin(), EltParts.end());
+ 
+     // The elements must be reversed when the element order is different
+     // to the endianness of the elements (because the BITCAST is itself a
+     // vector shuffle in this situation). However, we do not need any code to
+     // perform this reversal because getConstant() is producing a vector
+     // splat.
+     // This situation occurs in MIPS MSA.
+ 
+     // Repeat the per-element parts once for every element of VT.
+     SmallVector<SDValue, 8> Ops;
+     for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+       Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+ 
+     SDValue V = getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
+     return V;
+   }
+ 
+   assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
+          "APInt size does not match type size!");
+   // The scalar node is CSE'd on opcode, element type, the ConstantInt pointer
+   // and the isO flag.
+   unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
+   ID.AddPointer(Elt);
+   ID.AddBoolean(isO);
+   void *IP = nullptr;
+   SDNode *N = nullptr;
+   // A CSE hit can be returned as-is only for a scalar request; for a vector
+   // request the found scalar still has to be splatted below.
+   if ((N = FindNodeOrInsertPos(ID, DL, IP)))
+     if (!VT.isVector())
+       return SDValue(N, 0);
+ 
+   // No existing node: create the scalar constant and register it.
+   if (!N) {
+     N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
+     CSEMap.InsertNode(N, IP);
+     InsertNode(N);
+     NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
+   }
+ 
+   SDValue Result(N, 0);
+   if (VT.isVector())
+     Result = getSplatBuildVector(VT, DL, Result);
+ 
+   return Result;
+ }
+ 
+ /// Create an integer constant holding \p Val whose type is the target's
+ /// pointer type for the current data layout.
+ SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
+                                         bool isTarget) {
+   EVT PtrVT = TLI->getPointerTy(getDataLayout());
+   return getConstant(Val, DL, PtrVT, isTarget);
+ }
+ 
+ /// Create a floating-point constant of type \p VT from an APFloat by first
+ /// uniquing the value as a ConstantFP in the DAG's LLVMContext.
+ SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
+                                     bool isTarget) {
+   ConstantFP *Uniqued = ConstantFP::get(*getContext(), V);
+   return getConstantFP(*Uniqued, DL, VT, isTarget);
+ }
+ 
+ /// Create (or reuse) a floating-point constant node of type \p VT holding
+ /// \p V.  For a vector \p VT the result is a splat of the scalar constant.
+ /// \p isTarget selects ISD::TargetConstantFP instead of ISD::ConstantFP.
+ SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
+                                     EVT VT, bool isTarget) {
+   assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+ 
+   const bool WantVector = VT.isVector();
+   EVT EltVT = VT.getScalarType();
+   unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+ 
+   // Do the map lookup using the actual bit pattern for the floating point
+   // value, so that we don't have problems with 0.0 comparing equal to -0.0,
+   // and we don't have issues with SNANs.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opc, getVTList(EltVT), None);
+   Profile.AddPointer(&V);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Scalar = FindNodeOrInsertPos(Profile, DL, InsertPos);
+ 
+   // A hit satisfies a scalar request directly; a vector request still needs
+   // the splat built below.
+   if (Scalar && !WantVector)
+     return SDValue(Scalar, 0);
+ 
+   if (!Scalar) {
+     Scalar = newSDNode<ConstantFPSDNode>(isTarget, &V, EltVT);
+     CSEMap.InsertNode(Scalar, InsertPos);
+     InsertNode(Scalar);
+   }
+ 
+   SDValue Result(Scalar, 0);
+   if (WantVector)
+     Result = getSplatBuildVector(VT, DL, Result);
+ 
+   NewSDValueDbgMsg(Result, "Creating fp constant: ", this);
+   return Result;
+ }
+ 
+ /// Create a floating-point constant of type \p VT from a double.  f32 goes
+ /// through a cast to float and f64 is used directly.  f16, f80, f128 and
+ /// ppcf128 are converted with round-to-nearest-even.  Any other scalar type
+ /// is unsupported.
+ SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
+                                     bool isTarget) {
+   const EVT EltVT = VT.getScalarType();
+ 
+   if (EltVT == MVT::f32)
+     return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
+ 
+   if (EltVT == MVT::f64)
+     return getConstantFP(APFloat(Val), DL, VT, isTarget);
+ 
+   if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
+       EltVT == MVT::f16) {
+     // Start from the double and convert to the requested semantics; whether
+     // the conversion lost information is deliberately not inspected.
+     bool LosesInfo;
+     APFloat Converted(Val);
+     Converted.convert(EVTToAPFloatSemantics(EltVT),
+                       APFloat::rmNearestTiesToEven, &LosesInfo);
+     return getConstantFP(Converted, DL, VT, isTarget);
+   }
+ 
+   llvm_unreachable("Unsupported type in getConstantFP");
+ }
+ 
+ /// Return the address node for \p GV plus \p Offset with result type \p VT,
+ /// creating it if the DAG has no identical node yet.  Thread-local globals
+ /// use the TLS address opcodes; \p isTargetGA selects the Target* variants.
+ /// Non-zero \p TargetFlags are only permitted when \p isTargetGA is set.
+ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
+                                        EVT VT, int64_t Offset, bool isTargetGA,
+                                        unsigned char TargetFlags) {
+   assert((TargetFlags == 0 || isTargetGA) &&
+          "Cannot set target flags on target-independent globals");
+ 
+   // Truncate (with sign-extension) the offset value to the pointer size.
+   const unsigned PtrBits =
+       getDataLayout().getPointerTypeSizeInBits(GV->getType());
+   if (PtrBits < 64)
+     Offset = SignExtend64(Offset, PtrBits);
+ 
+   // Pick the opcode from the (target?, thread-local?) combination.
+   const bool IsTLS = GV->isThreadLocal();
+   unsigned Opc;
+   if (isTargetGA)
+     Opc = IsTLS ? ISD::TargetGlobalTLSAddress : ISD::TargetGlobalAddress;
+   else
+     Opc = IsTLS ? ISD::GlobalTLSAddress : ISD::GlobalAddress;
+ 
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opc, getVTList(VT), None);
+   Profile.AddPointer(GV);
+   Profile.AddInteger(Offset);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, DL, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<GlobalAddressSDNode>(
+       Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the (Target)FrameIndex node for frame index \p FI with result type
+ /// \p VT, creating it if the DAG has no identical node yet.
+ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
+   const unsigned Opcode = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+ 
+   // CSE identity: opcode, result type and the frame index.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
+   Profile.AddInteger(FI);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the (Target)JumpTable node for jump table index \p JTI with result
+ /// type \p VT, creating it if the DAG has no identical node yet.  Non-zero
+ /// \p TargetFlags are only permitted when \p isTarget is set.
+ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
+                                    unsigned char TargetFlags) {
+   assert((TargetFlags == 0 || isTarget) &&
+          "Cannot set target flags on target-independent jump tables");
+ 
+   const unsigned Opcode = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+ 
+   // CSE identity: opcode, result type, table index and target flags.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
+   Profile.AddInteger(JTI);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the (Target)ConstantPool node for IR constant \p C, creating it if
+ /// the DAG has no identical node yet.  An \p Alignment of 0 requests a
+ /// default: the ABI alignment of the constant's type when the function is
+ /// optimized for size, the preferred alignment otherwise.  Non-zero
+ /// \p TargetFlags are only permitted when \p isTarget is set.
+ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
+                                       unsigned Alignment, int Offset,
+                                       bool isTarget,
+                                       unsigned char TargetFlags) {
+   assert((TargetFlags == 0 || isTarget) &&
+          "Cannot set target flags on target-independent globals");
+ 
+   if (Alignment == 0) {
+     if (MF->getFunction().optForSize())
+       Alignment = getDataLayout().getABITypeAlignment(C->getType());
+     else
+       Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
+   }
+ 
+   const unsigned Opcode = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ 
+   // CSE identity: opcode, result type, alignment, offset, constant, flags.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
+   Profile.AddInteger(Alignment);
+   Profile.AddInteger(Offset);
+   Profile.AddPointer(C);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset,
+                                               Alignment, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the (Target)ConstantPool node for the machine constant-pool value
+ /// \p C, creating it if the DAG has no identical node yet.  An \p Alignment
+ /// of 0 requests the preferred alignment of the value's type.  The value
+ /// contributes its own data to the CSE identity.  Non-zero \p TargetFlags are
+ /// only permitted when \p isTarget is set.
+ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
+                                       unsigned Alignment, int Offset,
+                                       bool isTarget,
+                                       unsigned char TargetFlags) {
+   assert((TargetFlags == 0 || isTarget) &&
+          "Cannot set target flags on target-independent globals");
+ 
+   if (Alignment == 0)
+     Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
+ 
+   const unsigned Opcode = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ 
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
+   Profile.AddInteger(Alignment);
+   Profile.AddInteger(Offset);
+   C->addSelectionDAGCSEId(Profile);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset,
+                                               Alignment, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the TargetIndex node identified by \p Index, \p Offset and
+ /// \p TargetFlags with result type \p VT, creating it if the DAG has no
+ /// identical node yet.
+ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
+                                      unsigned char TargetFlags) {
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::TargetIndex, getVTList(VT), None);
+   Profile.AddInteger(Index);
+   Profile.AddInteger(Offset);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the BasicBlock node (type MVT::Other) that refers to \p MBB,
+ /// creating it if the DAG has no identical node yet.
+ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::BasicBlock, getVTList(MVT::Other), None);
+   Profile.AddPointer(MBB);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<BasicBlockSDNode>(MBB);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the VTSDNode describing \p VT, creating it on first request.
+ /// Simple types are cached in a table indexed by their SimpleTy value (grown
+ /// on demand); extended types are cached in a map keyed by the EVT.
+ SDValue SelectionDAG::getValueType(EVT VT) {
+   if (VT.isSimple()) {
+     const unsigned Idx = (unsigned)VT.getSimpleVT().SimpleTy;
+     if (Idx >= ValueTypeNodes.size())
+       ValueTypeNodes.resize(Idx + 1);
+   }
+ 
+   // Locate the cache slot for this type.
+   SDNode **Slot;
+   if (VT.isExtended())
+     Slot = &ExtendedValueTypeNodes[VT];
+   else
+     Slot = &ValueTypeNodes[VT.getSimpleVT().SimpleTy];
+ 
+   if (!*Slot) {
+     *Slot = newSDNode<VTSDNode>(VT);
+     InsertNode(*Slot);
+   }
+   return SDValue(*Slot, 0);
+ }
+ 
+ /// Return the ExternalSymbol node for \p Sym, creating it with type \p VT on
+ /// first request.  Nodes are cached by symbol name only.
+ SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
+   SDNode *&Slot = ExternalSymbols[Sym];
+   if (!Slot) {
+     Slot = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
+     InsertNode(Slot);
+   }
+   return SDValue(Slot, 0);
+ }
+ 
+ /// Return the MCSymbol node for \p Sym, creating it with type \p VT on first
+ /// request.  Nodes are cached by symbol pointer only.
+ SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
+   SDNode *&Slot = MCSymbols[Sym];
+   if (!Slot) {
+     Slot = newSDNode<MCSymbolSDNode>(Sym, VT);
+     InsertNode(Slot);
+   }
+   return SDValue(Slot, 0);
+ }
+ 
+ /// Return the target ExternalSymbol node for \p Sym with \p TargetFlags,
+ /// creating it with type \p VT on first request.  Nodes are cached by the
+ /// (symbol name, target flags) pair.
+ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
+                                               unsigned char TargetFlags) {
+   std::pair<std::string, unsigned char> Key(Sym, TargetFlags);
+ 
+   SDNode *&Slot = TargetExternalSymbols[Key];
+   if (!Slot) {
+     Slot = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
+     InsertNode(Slot);
+   }
+   return SDValue(Slot, 0);
+ }
+ 
+ /// Return the CondCode node for \p Cond, creating it on first request.  Nodes
+ /// are cached in a table indexed by the condition code, grown on demand.
+ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+   if ((unsigned)Cond >= CondCodeNodes.size())
+     CondCodeNodes.resize(Cond+1);
+ 
+   CondCodeSDNode *&Slot = CondCodeNodes[Cond];
+   if (Slot)
+     return SDValue(Slot, 0);
+ 
+   Slot = newSDNode<CondCodeSDNode>(Cond);
+   InsertNode(Slot);
+   return SDValue(Slot, 0);
+ }
+ 
+ /// Exchange the two shuffle inputs \p N1 and \p N2 and rewrite the mask \p M
+ /// to match: indices that selected from N1 now select from N2 and vice versa.
+ static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
+   ShuffleVectorSDNode::commuteMask(M);
+   std::swap(N1, N2);
+ }
+ 
+ /// Build a VECTOR_SHUFFLE of \p N1 and \p N2 with result type \p VT and the
+ /// given \p Mask, after canonicalizing the operands and the mask.  Each mask
+ /// entry must lie in [-1, 2 * NumElts); -1 denotes an undef lane, entries
+ /// below NumElts select from \p N1 and the rest select from \p N2.  Both
+ /// inputs must have type \p VT.  The canonicalization may make the shuffle
+ /// unnecessary, in which case undef, \p N1 or a (possibly bitcast) splat
+ /// build vector is returned instead of a shuffle node.
+ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
+                                        SDValue N2, ArrayRef<int> Mask) {
+   assert(VT.getVectorNumElements() == Mask.size() &&
+            "Must have the same number of vector elements as mask elements!");
+   assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+          "Invalid VECTOR_SHUFFLE");
+ 
+   // Canonicalize shuffle undef, undef -> undef
+   if (N1.isUndef() && N2.isUndef())
+     return getUNDEF(VT);
+ 
+   // Validate that all indices in Mask are within the range of the elements
+   // input to the shuffle.
+   int NElts = Mask.size();
+   assert(llvm::all_of(Mask,
+                       [&](int M) { return M < (NElts * 2) && M >= -1; }) &&
+          "Index out of range");
+ 
+   // Copy the mask so we can do any needed cleanup.
+   SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());
+ 
+   // Canonicalize shuffle v, v -> v, undef
+   if (N1 == N2) {
+     N2 = getUNDEF(VT);
+     for (int i = 0; i != NElts; ++i)
+       if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
+   }
+ 
+   // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
+   if (N1.isUndef())
+     commuteShuffle(N1, N2, MaskVec);
+ 
+   if (TLI->hasVectorBlend()) {
+     // If shuffling a splat, try to blend the splat instead. We do this here so
+     // that even when this arises during lowering we don't have to re-handle it.
+     // Offset is the first mask index that refers to BV's lanes (0 for N1,
+     // NElts for N2).
+     auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+       BitVector UndefElements;
+       SDValue Splat = BV->getSplatValue(&UndefElements);
+       if (!Splat)
+         return;
+ 
+       for (int i = 0; i < NElts; ++i) {
+         // Skip lanes that do not read from this input.
+         if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+           continue;
+ 
+         // If this input comes from undef, mark it as such.
+         if (UndefElements[MaskVec[i] - Offset]) {
+           MaskVec[i] = -1;
+           continue;
+         }
+ 
+         // If we can blend a non-undef lane, use that instead.
+         if (!UndefElements[i])
+           MaskVec[i] = i + Offset;
+       }
+     };
+     if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+       BlendSplat(N1BV, 0);
+     if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+       BlendSplat(N2BV, NElts);
+   }
+ 
+   // Canonicalize all index into lhs, -> shuffle lhs, undef
+   // Canonicalize all index into rhs, -> shuffle rhs, undef
+   bool AllLHS = true, AllRHS = true;
+   bool N2Undef = N2.isUndef();
+   for (int i = 0; i != NElts; ++i) {
+     if (MaskVec[i] >= NElts) {
+       // A lane reading from an undef N2 is itself undef.
+       if (N2Undef)
+         MaskVec[i] = -1;
+       else
+         AllLHS = false;
+     } else if (MaskVec[i] >= 0) {
+       AllRHS = false;
+     }
+   }
+   // Both flags set means no lane reads from either input (every remaining
+   // mask entry is negative), so the result is undef.
+   if (AllLHS && AllRHS)
+     return getUNDEF(VT);
+   if (AllLHS && !N2Undef)
+     N2 = getUNDEF(VT);
+   if (AllRHS) {
+     N1 = getUNDEF(VT);
+     commuteShuffle(N1, N2, MaskVec);
+   }
+   // Reset our undef status after accounting for the mask.
+   N2Undef = N2.isUndef();
+   // Re-check whether both sides ended up undef.
+   if (N1.isUndef() && N2Undef)
+     return getUNDEF(VT);
+ 
+   // If Identity shuffle return that node.
+   // AllSame records whether every mask entry equals the first one; it is
+   // used by the splat handling below.
+   bool Identity = true, AllSame = true;
+   for (int i = 0; i != NElts; ++i) {
+     if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
+     if (MaskVec[i] != MaskVec[0]) AllSame = false;
+   }
+   // The NElts check keeps an empty mask from being treated as an identity.
+   if (Identity && NElts)
+     return N1;
+ 
+   // Shuffling a constant splat doesn't change the result.
+   if (N2Undef) {
+     SDValue V = N1;
+ 
+     // Look through any bitcasts. We check that these don't change the number
+     // (and size) of elements and just changes their types.
+     while (V.getOpcode() == ISD::BITCAST)
+       V = V->getOperand(0);
+ 
+     // A splat should always show up as a build vector node.
+     if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+       BitVector UndefElements;
+       SDValue Splat = BV->getSplatValue(&UndefElements);
+       // If this is a splat of an undef, shuffling it is also undef.
+       if (Splat && Splat.isUndef())
+         return getUNDEF(VT);
+ 
+       // False when the bitcasts looked through above changed the element
+       // count relative to VT.
+       bool SameNumElts =
+           V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
+ 
+       // We only have a splat which can skip shuffles if there is a splatted
+       // value and no undef lanes rearranged by the shuffle.
+       if (Splat && UndefElements.none()) {
+         // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+         // number of elements match or the value splatted is a zero constant.
+         if (SameNumElts)
+           return N1;
+         if (auto *C = dyn_cast<ConstantSDNode>(Splat))
+           if (C->isNullValue())
+             return N1;
+       }
+ 
+       // If the shuffle itself creates a splat, build the vector directly.
+       if (AllSame && SameNumElts) {
+         EVT BuildVT = BV->getValueType(0);
+         // Every lane selects the same element, so MaskVec[0] names the
+         // build-vector operand to splat.
+         const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+         SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
+ 
+         // We may have jumped through bitcasts, so the type of the
+         // BUILD_VECTOR may not match the type of the shuffle.
+         if (BuildVT != VT)
+           NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
+         return NewBV;
+       }
+     }
+   }
+ 
+   // No simplification applied: look up or create the shuffle node.  The CSE
+   // identity consists of the opcode, type, both operands and every mask
+   // entry.
+   FoldingSetNodeID ID;
+   SDValue Ops[2] = { N1, N2 };
+   AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
+   for (int i = 0; i != NElts; ++i)
+     ID.AddInteger(MaskVec[i]);
+ 
+   void* IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+     return SDValue(E, 0);
+ 
+   // Allocate the mask array for the node out of the BumpPtrAllocator, since
+   // SDNode doesn't have access to it.  This memory will be "leaked" when
+   // the node is deallocated, but recovered when the NodeAllocator is released.
+   int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+   llvm::copy(MaskVec, MaskAlloc);
+ 
+   auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
+                                            dl.getDebugLoc(), MaskAlloc);
+   createOperands(N, Ops);
+ 
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V = SDValue(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ /// Return a shuffle built from \p SV with its two inputs exchanged and its
+ /// mask commuted to match.
+ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
+   // Work on a private copy of the mask; the node's own mask is left alone.
+   ArrayRef<int> OrigMask = SV.getMask();
+   SmallVector<int, 8> Commuted(OrigMask.begin(), OrigMask.end());
+   ShuffleVectorSDNode::commuteMask(Commuted);
+ 
+   SDValue NewLHS = SV.getOperand(1);
+   SDValue NewRHS = SV.getOperand(0);
+   return getVectorShuffle(SV.getValueType(0), SDLoc(&SV), NewLHS, NewRHS,
+                           Commuted);
+ }
+ 
+ /// Return the Register node for \p RegNo with type \p VT, creating it if the
+ /// DAG has no identical node yet.  A newly created node has its divergence
+ /// bit set from the target's isSDNodeSourceOfDivergence hook.
+ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::Register, getVTList(VT), None);
+   Profile.AddInteger(RegNo);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<RegisterSDNode>(RegNo, VT);
+   Fresh->SDNodeBits.IsDivergent =
+       TLI->isSDNodeSourceOfDivergence(Fresh, FLI, DA);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the RegisterMask node (type MVT::Untyped) for \p RegMask, creating
+ /// it if the DAG has no identical node yet.  Masks are identified by pointer.
+ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::RegisterMask, getVTList(MVT::Untyped), None);
+   Profile.AddPointer(RegMask);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<RegisterMaskSDNode>(RegMask);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return an ISD::EH_LABEL node for \p Label with \p Root as its operand;
+ /// shorthand for getLabelNode with that opcode.
+ SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
+                                  MCSymbol *Label) {
+   const unsigned Opcode = ISD::EH_LABEL;
+   return getLabelNode(Opcode, dl, Root, Label);
+ }
+ 
+ /// Return a label node with the given \p Opcode (type MVT::Other) that has
+ /// \p Root as its only operand and refers to \p Label, creating it if the DAG
+ /// has no identical node yet.
+ SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
+                                    SDValue Root, MCSymbol *Label) {
+   SDValue Ops[] = { Root };
+ 
+   // CSE identity: opcode, type, the operand and the label pointer.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(MVT::Other), Ops);
+   Profile.AddPointer(Label);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh =
+       newSDNode<LabelSDNode>(dl.getIROrder(), dl.getDebugLoc(), Label);
+   createOperands(Fresh, Ops);
+ 
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the (Target)BlockAddress node for \p BA plus \p Offset with result
+ /// type \p VT and the given \p TargetFlags, creating it if the DAG has no
+ /// identical node yet.
+ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+                                       int64_t Offset,
+                                       bool isTarget,
+                                       unsigned char TargetFlags) {
+   const unsigned Opcode =
+       isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+ 
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, Opcode, getVTList(VT), None);
+   Profile.AddPointer(BA);
+   Profile.AddInteger(Offset);
+   Profile.AddInteger(TargetFlags);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh =
+       newSDNode<BlockAddressSDNode>(Opcode, VT, BA, Offset, TargetFlags);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the SRCVALUE node (type MVT::Other) wrapping the IR value \p V,
+ /// creating it if the DAG has no identical node yet.  \p V may be null;
+ /// otherwise it must have pointer type.
+ SDValue SelectionDAG::getSrcValue(const Value *V) {
+   assert((!V || V->getType()->isPointerTy()) &&
+          "SrcValue is not a pointer?");
+ 
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::SRCVALUE, getVTList(MVT::Other), None);
+   Profile.AddPointer(V);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<SrcValueSDNode>(V);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return the MDNODE_SDNODE node (type MVT::Other) wrapping the metadata
+ /// node \p MD, creating it if the DAG has no identical node yet.
+ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
+   Profile.AddPointer(MD);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<MDNodeSDNode>(MD);
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return \p V reinterpreted as \p VT.  When \p V already has that type it is
+ /// returned unchanged; otherwise a BITCAST located at \p V is created.
+ SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
+   const bool AlreadyVT = (VT == V.getValueType());
+   return AlreadyVT ? V : getNode(ISD::BITCAST, SDLoc(V), VT, V);
+ }
+ 
+ /// Return an ADDRSPACECAST of \p Ptr from address space \p SrcAS to
+ /// \p DestAS with result type \p VT, creating it if the DAG has no identical
+ /// node yet.
+ SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
+                                        unsigned SrcAS, unsigned DestAS) {
+   SDValue Ops[] = {Ptr};
+ 
+   // CSE identity: opcode, type, the pointer operand and both address spaces.
+   FoldingSetNodeID Profile;
+   AddNodeIDNode(Profile, ISD::ADDRSPACECAST, getVTList(VT), Ops);
+   Profile.AddInteger(SrcAS);
+   Profile.AddInteger(DestAS);
+ 
+   void *InsertPos = nullptr;
+   SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
+   if (Existing)
+     return SDValue(Existing, 0);
+ 
+   auto *Fresh = newSDNode<AddrSpaceCastSDNode>(
+       dl.getIROrder(), dl.getDebugLoc(), VT, SrcAS, DestAS);
+   createOperands(Fresh, Ops);
+ 
+   CSEMap.InsertNode(Fresh, InsertPos);
+   InsertNode(Fresh);
+   return SDValue(Fresh, 0);
+ }
+ 
+ /// Return \p Op converted to the shift-amount type the target wants for a
+ /// shift whose left-hand side has type \p LHSTy.  Vector operands and
+ /// operands that already have that type are returned unchanged; anything else
+ /// is zero-extended or truncated.
+ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
+   const EVT OpTy = Op.getValueType();
+   const EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
+ 
+   const bool NeedsCast = !(OpTy == ShTy) && !OpTy.isVector();
+   if (!NeedsCast)
+     return Op;
+ 
+   return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
+ }
+ 
+ /// Expand \p Node (operands: 0 and 1 feed the va_list load, 2 is the source
+ /// value, 3 is the constant alignment) into explicit DAG operations: load the
+ /// current va_list pointer, round it up when the requested alignment exceeds
+ /// the minimum stack argument alignment, store back the pointer advanced past
+ /// the argument, and finally load the argument itself.
+ SDValue SelectionDAG::expandVAArg(SDNode *Node) {
+   SDLoc dl(Node);
+   const TargetLowering &TLI = getTargetLoweringInfo();
+ 
+   const Value *SrcVal = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+   EVT ArgVT = Node->getValueType(0);
+   SDValue InChain = Node->getOperand(0);
+   SDValue ListPtr = Node->getOperand(1);
+   unsigned Align = Node->getConstantOperandVal(3);
+ 
+   // Fetch the current argument pointer out of the va_list.
+   SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, InChain,
+                                ListPtr, MachinePointerInfo(SrcVal));
+   SDValue VAList = VAListLoad;
+   EVT PtrVT = VAList.getValueType();
+ 
+   if (Align > TLI.getMinStackArgumentAlignment()) {
+     assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+ 
+     // Round the pointer up: (VAList + Align - 1) & -Align.
+     SDValue AlignMinusOne = getConstant(Align - 1, dl, PtrVT);
+     VAList = getNode(ISD::ADD, dl, PtrVT, VAList, AlignMinusOne);
+ 
+     SDValue AlignMask = getConstant(-(int64_t)Align, dl, PtrVT);
+     VAList = getNode(ISD::AND, dl, PtrVT, VAList, AlignMask);
+   }
+ 
+   // Increment the pointer, VAList, to the next vaarg
+   SDValue ArgSize = getConstant(
+       getDataLayout().getTypeAllocSize(ArgVT.getTypeForEVT(*getContext())), dl,
+       PtrVT);
+   SDValue NextPtr = getNode(ISD::ADD, dl, PtrVT, VAList, ArgSize);
+ 
+   // Store the incremented VAList to the legalized pointer
+   SDValue StoreChain = getStore(VAListLoad.getValue(1), dl, NextPtr, ListPtr,
+                                 MachinePointerInfo(SrcVal));
+ 
+   // Load the actual argument out of the pointer VAList
+   return getLoad(ArgVT, dl, StoreChain, VAList, MachinePointerInfo());
+ }
+ 
+ SDValue SelectionDAG::expandVACopy(SDNode *Node) {
+   // Default expansion: load one pointer-sized value through operand 2 and
+   // store it through operand 1; the store is the returned value.
+   SDLoc dl(Node);
+   const Value *DstSV = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+   const Value *SrcSV = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+   EVT PtrVT = getTargetLoweringInfo().getPointerTy(getDataLayout());
+   SDValue Loaded = getLoad(PtrVT, dl, Node->getOperand(0), Node->getOperand(2),
+                            MachinePointerInfo(SrcSV));
+   SDValue LoadChain = Loaded.getValue(1);
+   return getStore(LoadChain, dl, Loaded, Node->getOperand(1),
+                   MachinePointerInfo(DstSV));
+ }
+ 
+ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+   // Use the larger of the type's preferred alignment and the caller's minimum.
+   Type *IRTy = VT.getTypeForEVT(*getContext());
+   unsigned PrefAlign = (unsigned)getDataLayout().getPrefTypeAlignment(IRTy);
+   unsigned SlotAlign = PrefAlign < minAlign ? minAlign : PrefAlign;
+   unsigned SlotBytes = VT.getStoreSize();
+   MachineFrameInfo &FrameInfo = getMachineFunction().getFrameInfo();
+   int Slot = FrameInfo.CreateStackObject(SlotBytes, SlotAlign, false);
+   return getFrameIndex(Slot, TLI->getFrameIndexTy(getDataLayout()));
+ }
+ 
+ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
+   // Size and align one slot so that it can hold either type.
+   Type *FirstTy = VT1.getTypeForEVT(*getContext());
+   Type *SecondTy = VT2.getTypeForEVT(*getContext());
+   const DataLayout &Layout = getDataLayout();
+   unsigned SlotAlign = std::max(Layout.getPrefTypeAlignment(FirstTy),
+                                 Layout.getPrefTypeAlignment(SecondTy));
+   unsigned SlotBytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
+   int Slot = getMachineFunction().getFrameInfo().CreateStackObject(
+       SlotBytes, SlotAlign, false);
+   return getFrameIndex(Slot, TLI->getFrameIndexTy(Layout));
+ }
+ 
+ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
+                                 ISD::CondCode Cond, const SDLoc &dl) {
+   EVT OpVT = N1.getValueType();
+   // Folds setcc(N1, N2, Cond) where possible; a null SDValue means no fold.
+   // These setcc operations always fold.
+   switch (Cond) {
+   default: break;
+   case ISD::SETFALSE:
+   case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT);
+   case ISD::SETTRUE:
+   case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT);
+   // The codes below must not be used with integer operands.
+   case ISD::SETOEQ:
+   case ISD::SETOGT:
+   case ISD::SETOGE:
+   case ISD::SETOLT:
+   case ISD::SETOLE:
+   case ISD::SETONE:
+   case ISD::SETO:
+   case ISD::SETUO:
+   case ISD::SETUEQ:
+   case ISD::SETUNE:
+     assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
+     break;
+   }
+   // Both operands are integer constants: evaluate the predicate on them.
+   if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
+     const APInt &C2 = N2C->getAPIntValue();
+     if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
+       const APInt &C1 = N1C->getAPIntValue();
+ 
+       switch (Cond) {
+       default: llvm_unreachable("Unknown integer setcc!");
+       case ISD::SETEQ:  return getBoolConstant(C1 == C2, dl, VT, OpVT);
+       case ISD::SETNE:  return getBoolConstant(C1 != C2, dl, VT, OpVT);
+       case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT);
+       case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT);
+       case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT);
+       case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT);
+       case ISD::SETLT:  return getBoolConstant(C1.slt(C2), dl, VT, OpVT);
+       case ISD::SETGT:  return getBoolConstant(C1.sgt(C2), dl, VT, OpVT);
+       case ISD::SETLE:  return getBoolConstant(C1.sle(C2), dl, VT, OpVT);
+       case ISD::SETGE:  return getBoolConstant(C1.sge(C2), dl, VT, OpVT);
+       }
+     }
+   }
+   if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1)) {
+     if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2)) {
+       APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
+       switch (Cond) {
+       default: break;
+       case ISD::SETEQ:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT); // Unordered: fold to undef.
+                         LLVM_FALLTHROUGH; // Else as SETOEQ; pattern repeats.
+       case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT,
+                                                OpVT);
+       case ISD::SETNE:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT);
+                         LLVM_FALLTHROUGH;
+       case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+                                                R==APFloat::cmpLessThan, dl, VT,
+                                                OpVT);
+       case ISD::SETLT:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT);
+                         LLVM_FALLTHROUGH;
+       case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT,
+                                                OpVT);
+       case ISD::SETGT:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT);
+                         LLVM_FALLTHROUGH;
+       case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl,
+                                                VT, OpVT);
+       case ISD::SETLE:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT);
+                         LLVM_FALLTHROUGH;
+       case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan ||
+                                                R==APFloat::cmpEqual, dl, VT,
+                                                OpVT);
+       case ISD::SETGE:  if (R==APFloat::cmpUnordered)
+                           return getUNDEF(VT);
+                         LLVM_FALLTHROUGH;
+       case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+                                            R==APFloat::cmpEqual, dl, VT, OpVT);
+       case ISD::SETO:   return getBoolConstant(R!=APFloat::cmpUnordered, dl, VT,
+                                                OpVT);
+       case ISD::SETUO:  return getBoolConstant(R==APFloat::cmpUnordered, dl, VT,
+                                                OpVT);
+       case ISD::SETUEQ: return getBoolConstant(R==APFloat::cmpUnordered ||
+                                                R==APFloat::cmpEqual, dl, VT,
+                                                OpVT);
+       case ISD::SETUNE: return getBoolConstant(R!=APFloat::cmpEqual, dl, VT,
+                                                OpVT);
+       case ISD::SETULT: return getBoolConstant(R==APFloat::cmpUnordered ||
+                                                R==APFloat::cmpLessThan, dl, VT,
+                                                OpVT);
+       case ISD::SETUGT: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+                                                R==APFloat::cmpUnordered, dl, VT,
+                                                OpVT);
+       case ISD::SETULE: return getBoolConstant(R!=APFloat::cmpGreaterThan, dl,
+                                                VT, OpVT);
+       case ISD::SETUGE: return getBoolConstant(R!=APFloat::cmpLessThan, dl, VT,
+                                                OpVT);
+       }
+     } else {
+       // Only N1 is an FP constant: ensure that the constant occurs on the RHS.
+       ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+       MVT CompVT = N1.getValueType().getSimpleVT();
+       if (!TLI->isCondCodeLegal(SwappedCond, CompVT))
+         return SDValue(); // Swapped condition is not legal for CompVT.
+ 
+       return getSetCC(dl, VT, N2, N1, SwappedCond);
+     }
+   }
+ 
+   // Could not fold it.
+   return SDValue();
+ }
+ 
+ /// See if \p V can be simplified given that only the bits set in \p Mask are
+ /// used. Returns the simplified value, or a null SDValue if nothing applies.
+ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &Mask) {
+   switch (V.getOpcode()) {
+   default:
+     break;
+   case ISD::Constant: {
+     const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
+     assert(CV && "Const value should be ConstSDNode.");
+     const APInt &CVal = CV->getAPIntValue();
+     APInt NewVal = CVal & Mask;
+     if (NewVal != CVal)
+       return getConstant(NewVal, SDLoc(V), V.getValueType());
+     break;
+   }
+   case ISD::OR:
+   case ISD::XOR:
+     // If the LHS or RHS don't contribute demanded bits, drop them.
+     if (MaskedValueIsZero(V.getOperand(0), Mask))
+       return V.getOperand(1);
+     if (MaskedValueIsZero(V.getOperand(1), Mask))
+       return V.getOperand(0);
+     break;
+   case ISD::SRL:
+     // Only look at single-use SRLs.
+     if (!V.getNode()->hasOneUse())
+       break;
+     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+       // Watch out for shift count overflow though. Check the full-width
+       // constant so a huge amount cannot wrap when narrowed to unsigned.
+       if (RHSC->getAPIntValue().uge(Mask.getBitWidth()))
+         break;
+       // See if we can recursively simplify the LHS.
+       unsigned Amt = RHSC->getZExtValue();
+       APInt NewMask = Mask << Amt;
+       if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
+         return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
+                        V.getOperand(1));
+     }
+     break;
+   case ISD::AND: {
+     // X & -1 -> X (ignoring bits which aren't demanded).
+     ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
+     if (AndVal && Mask.isSubsetOf(AndVal->getAPIntValue()))
+       return V.getOperand(0);
+     break;
+   }
+   case ISD::ANY_EXTEND: {
+     SDValue Src = V.getOperand(0);
+     unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
+     // Being conservative here - only peek through if we only demand bits in the
+     // non-extended source (even though the extended bits are technically undef).
+     if (Mask.getActiveBits() > SrcBitWidth)
+       break;
+     APInt SrcMask = Mask.trunc(SrcBitWidth);
+     if (SDValue DemandedSrc = GetDemandedBits(Src, SrcMask))
+       return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
+     break;
+   }
+   }
+   return SDValue();
+ }
+ 
+ /// Return true when the top (sign) bit of \p Op, per scalar element, is known
+ /// to be zero. Implemented as a MaskedValueIsZero query on the sign mask.
+ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+   const APInt SignMask = APInt::getSignMask(Op.getScalarValueSizeInBits());
+   return MaskedValueIsZero(Op, SignMask, Depth);
+ }
+ 
+ /// Return true if every bit set in \p Mask is known to be zero in \p Op,
+ /// i.e. 'Op & Mask' is known to be zero.
+ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
+                                      unsigned Depth) const {
+   const KnownBits Bits = computeKnownBits(Op, Depth);
+   return Mask.isSubsetOf(Bits.Zero);
+ }
+ 
+ /// Return true if every element of vector \p V selected by \p DemandedElts
+ /// holds the same value. Undef elements seen while checking are recorded in
+ /// \p UndefElts.
+ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
+                                 APInt &UndefElts) {
+   // No demanded elts, better to assume we don't know anything.
+   if (DemandedElts == 0)
+     return false;
+ 
+   EVT VecVT = V.getValueType();
+   assert(VecVT.isVector() && "Vector type expected");
+ 
+   unsigned NumElts = VecVT.getVectorNumElements();
+   assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
+ 
+   // Start from "no undef elements"; the cases below set bits as they go.
+   UndefElts = APInt::getNullValue(NumElts);
+ 
+   switch (V.getOpcode()) {
+   case ISD::BUILD_VECTOR: {
+     // All demanded, non-undef operands must be the same SDValue.
+     SDValue Splatted;
+     for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
+       SDValue EltOp = V.getOperand(Elt);
+       if (EltOp.isUndef()) {
+         UndefElts.setBit(Elt);
+       } else if (DemandedElts[Elt]) {
+         if (Splatted && Splatted != EltOp)
+           return false;
+         Splatted = EltOp;
+       }
+     }
+     return true;
+   }
+   case ISD::VECTOR_SHUFFLE: {
+     // Check if this is a shuffle node doing a splat.
+     // TODO: Do we need to handle shuffle(splat, undef, mask)?
+     ArrayRef<int> ShufMask = cast<ShuffleVectorSDNode>(V)->getMask();
+     int SplatIndex = -1;
+     for (int Elt = 0; Elt != (int)NumElts; ++Elt) {
+       int SrcIdx = ShufMask[Elt];
+       if (SrcIdx < 0) {
+         UndefElts.setBit(Elt);
+       } else if (DemandedElts[Elt]) {
+         if (SplatIndex >= 0 && SplatIndex != SrcIdx)
+           return false;
+         SplatIndex = SrcIdx;
+       }
+     }
+     return true;
+   }
+   case ISD::EXTRACT_SUBVECTOR: {
+     SDValue Src = V.getOperand(0);
+     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+     ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
+     // Only handled for a constant index that keeps the subvector in range.
+     if (!SubIdx || !SubIdx->getAPIntValue().ule(NumSrcElts - NumElts))
+       break;
+ 
+     // Offset the demanded elts by the subvector index.
+     uint64_t Idx = SubIdx->getZExtValue();
+     APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+     APInt UndefSrcElts;
+     if (!isSplatValue(Src, DemandedSrc, UndefSrcElts))
+       break;
+     UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
+     return true;
+   }
+   case ISD::ADD:
+   case ISD::SUB:
+   case ISD::AND: {
+     // Splat only if both operands are; undef lanes of either carry over.
+     APInt UndefLHS, UndefRHS;
+     if (!isSplatValue(V.getOperand(0), DemandedElts, UndefLHS) ||
+         !isSplatValue(V.getOperand(1), DemandedElts, UndefRHS))
+       break;
+     UndefElts = UndefLHS | UndefRHS;
+     return true;
+   }
+   }
+ 
+   return false;
+ }
+ 
+ /// Convenience overload: demand every element of \p V, and unless
+ /// \p AllowUndefs is set also require that no element was undef.
+ bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
+   EVT VecVT = V.getValueType();
+   assert(VecVT.isVector() && "Vector type expected");
+   const APInt AllElts = APInt::getAllOnesValue(VecVT.getVectorNumElements());
+   APInt UndefElts;
+   if (!isSplatValue(V, AllElts, UndefElts))
+     return false;
+   return AllowUndefs || UndefElts == 0;
+ }
+ 
+ /// Return the constant that N is, or splats over DemandedElts; else null.
+ static ConstantSDNode *isConstOrDemandedConstSplat(SDValue N,
+                                                    const APInt &DemandedElts) {
+   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
+     return CN;
+   if (N.getOpcode() != ISD::BUILD_VECTOR)
+     return nullptr;
+   EVT VT = N.getValueType();
+   ConstantSDNode *Cst = nullptr;
+   unsigned NumElts = VT.getVectorNumElements();
+   assert(DemandedElts.getBitWidth() == NumElts && "Unexpected vector size");
+   for (unsigned i = 0; i != NumElts; ++i) {
+     if (!DemandedElts[i])
+       continue;
+     ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(i));
+     // Compare types first: APInt '!=' requires operands of equal bit width.
+     if (!C || C->getValueType(0) != VT.getScalarType() ||
+         (Cst && Cst->getAPIntValue() != C->getAPIntValue()))
+       return nullptr;
+     Cst = C;
+   }
+   return Cst;
+ }
+ 
+ /// For a shift node \p V, return its shift amount when operand 1 is a constant
+ /// (or constant splat) strictly below V's scalar bit width; otherwise null.
+ static const APInt *getValidShiftAmountConstant(SDValue V) {
+   ConstantSDNode *AmtNode = isConstOrConstSplat(V.getOperand(1));
+   if (!AmtNode)
+     return nullptr;
+ 
+   // Shifting by the bit width or more is not valid.
+   const APInt &Amt = AmtNode->getAPIntValue();
+   return Amt.ult(V.getScalarValueSizeInBits()) ? &Amt : nullptr;
+ }
+ 
+ /// Compute the known-zero/known-one bits of \p Op with every element demanded:
+ /// all lanes for a vector, or a single-bit mask of 1 for a scalar. The result
+ /// comes from the DemandedElts overload.
+ KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
+   EVT OpVT = Op.getValueType();
+   if (!OpVT.isVector())
+     return computeKnownBits(Op, APInt(1, 1), Depth);
+   APInt AllLanes = APInt::getAllOnesValue(OpVT.getVectorNumElements());
+   return computeKnownBits(Op, AllLanes, Depth);
+ }
+ 
+ /// Determine which bits of Op are known to be either zero or one and return
+ /// them in Known. The DemandedElts argument allows us to only collect the known
+ /// bits that are shared by the requested vector elements.
+ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
+                                          unsigned Depth) const {
+   unsigned BitWidth = Op.getScalarValueSizeInBits();
+ 
+   KnownBits Known(BitWidth);   // Don't know anything.
+ 
+   if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+     // We know all of the bits for a constant!
+     Known.One = C->getAPIntValue();
+     Known.Zero = ~Known.One;
+     return Known;
+   }
+   if (auto *C = dyn_cast<ConstantFPSDNode>(Op)) {
+     // We know all of the bits for a constant fp!
+     Known.One = C->getValueAPF().bitcastToAPInt();
+     Known.Zero = ~Known.One;
+     return Known;
+   }
+ 
+   if (Depth == 6)
+     return Known;  // Limit search depth.
+ 
+   KnownBits Known2;
+   unsigned NumElts = DemandedElts.getBitWidth();
+   assert((!Op.getValueType().isVector() ||
+           NumElts == Op.getValueType().getVectorNumElements()) &&
+          "Unexpected vector size");
+ 
+   if (!DemandedElts)
+     return Known;  // No demanded elts, better to assume we don't know anything.
+ 
+   unsigned Opcode = Op.getOpcode();
+   switch (Opcode) {
+   case ISD::BUILD_VECTOR:
+     // Collect the known bits that are shared by every demanded vector element.
+     Known.Zero.setAllBits(); Known.One.setAllBits();
+     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+       if (!DemandedElts[i])
+         continue;
+ 
+       SDValue SrcOp = Op.getOperand(i);
+       Known2 = computeKnownBits(SrcOp, Depth + 1);
+ 
+       // BUILD_VECTOR can implicitly truncate sources, we must handle this.
+       if (SrcOp.getValueSizeInBits() != BitWidth) {
+         assert(SrcOp.getValueSizeInBits() > BitWidth &&
+                "Expected BUILD_VECTOR implicit truncation");
+         Known2 = Known2.trunc(BitWidth);
+       }
+ 
+       // Known bits are the values that are shared by every demanded element.
+       Known.One &= Known2.One;
+       Known.Zero &= Known2.Zero;
+ 
+       // If we don't know any bits, early out.
+       if (Known.isUnknown())
+         break;
+     }
+     break;
+   case ISD::VECTOR_SHUFFLE: {
+     // Collect the known bits that are shared by every vector element referenced
+     // by the shuffle.
+     APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+     Known.Zero.setAllBits(); Known.One.setAllBits();
+     const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+     assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+     for (unsigned i = 0; i != NumElts; ++i) {
+       if (!DemandedElts[i])
+         continue;
+ 
+       int M = SVN->getMaskElt(i);
+       if (M < 0) {
+         // For UNDEF elements, we don't know anything about the common state of
+         // the shuffle result.
+         Known.resetAll();
+         DemandedLHS.clearAllBits();
+         DemandedRHS.clearAllBits();
+         break;
+       }
+ 
+       if ((unsigned)M < NumElts)
+         DemandedLHS.setBit((unsigned)M % NumElts);
+       else
+         DemandedRHS.setBit((unsigned)M % NumElts);
+     }
+     // Known bits are the values that are shared by every demanded element.
+     if (!!DemandedLHS) {
+       SDValue LHS = Op.getOperand(0);
+       Known2 = computeKnownBits(LHS, DemandedLHS, Depth + 1);
+       Known.One &= Known2.One;
+       Known.Zero &= Known2.Zero;
+     }
+     // If we don't know any bits, early out.
+     if (Known.isUnknown())
+       break;
+     if (!!DemandedRHS) {
+       SDValue RHS = Op.getOperand(1);
+       Known2 = computeKnownBits(RHS, DemandedRHS, Depth + 1);
+       Known.One &= Known2.One;
+       Known.Zero &= Known2.Zero;
+     }
+     break;
+   }
+   case ISD::CONCAT_VECTORS: {
+     // Split DemandedElts and test each of the demanded subvectors.
+     Known.Zero.setAllBits(); Known.One.setAllBits();
+     EVT SubVectorVT = Op.getOperand(0).getValueType();
+     unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+     unsigned NumSubVectors = Op.getNumOperands();
+     for (unsigned i = 0; i != NumSubVectors; ++i) {
+       APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+       DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+       if (!!DemandedSub) {
+         SDValue Sub = Op.getOperand(i);
+         Known2 = computeKnownBits(Sub, DemandedSub, Depth + 1);
+         Known.One &= Known2.One;
+         Known.Zero &= Known2.Zero;
+       }
+       // If we don't know any bits, early out.
+       if (Known.isUnknown())
+         break;
+     }
+     break;
+   }
+   case ISD::INSERT_SUBVECTOR: {
+     // If we know the element index, demand any elements from the subvector and
+     // the remainder from the src its inserted into, otherwise demand them all.
+     SDValue Src = Op.getOperand(0);
+     SDValue Sub = Op.getOperand(1);
+     ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+     if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
+       Known.One.setAllBits();
+       Known.Zero.setAllBits();
+       uint64_t Idx = SubIdx->getZExtValue();
+       APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+       if (!!DemandedSubElts) {
+         Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
+         if (Known.isUnknown())
+           break; // early-out.
+       }
+       APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
+       APInt DemandedSrcElts = DemandedElts & ~SubMask;
+       if (!!DemandedSrcElts) {
+         Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
+         Known.One &= Known2.One;
+         Known.Zero &= Known2.Zero;
+       }
+     } else {
+       Known = computeKnownBits(Sub, Depth + 1);
+       if (Known.isUnknown())
+         break; // early-out.
+       Known2 = computeKnownBits(Src, Depth + 1);
+       Known.One &= Known2.One;
+       Known.Zero &= Known2.Zero;
+     }
+     break;
+   }
+   case ISD::EXTRACT_SUBVECTOR: {
+     // If we know the element index, just demand that subvector elements,
+     // otherwise demand them all.
+     SDValue Src = Op.getOperand(0);
+     ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+     if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+       // Offset the demanded elts by the subvector index.
+       uint64_t Idx = SubIdx->getZExtValue();
+       APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+       Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
+     } else {
+       Known = computeKnownBits(Src, Depth + 1);
+     }
+     break;
+   }
+   case ISD::SCALAR_TO_VECTOR: {
+     // We know about scalar_to_vector as much as we know about it source,
+     // which becomes the first element of otherwise unknown vector.
+     if (DemandedElts != 1)
+       break;
+ 
+     SDValue N0 = Op.getOperand(0);
+     Known = computeKnownBits(N0, Depth + 1);
+     if (N0.getValueSizeInBits() != BitWidth)
+       Known = Known.trunc(BitWidth);
+ 
+     break;
+   }
+   case ISD::BITCAST: {
+     SDValue N0 = Op.getOperand(0);
+     EVT SubVT = N0.getValueType();
+     unsigned SubBitWidth = SubVT.getScalarSizeInBits();
+ 
+     // Ignore bitcasts from unsupported types.
+     if (!(SubVT.isInteger() || SubVT.isFloatingPoint()))
+       break;
+ 
+     // Fast handling of 'identity' bitcasts.
+     if (BitWidth == SubBitWidth) {
+       Known = computeKnownBits(N0, DemandedElts, Depth + 1);
+       break;
+     }
+ 
+     bool IsLE = getDataLayout().isLittleEndian();
+ 
+     // Bitcast 'small element' vector to 'large element' scalar/vector.
+     if ((BitWidth % SubBitWidth) == 0) {
+       assert(N0.getValueType().isVector() && "Expected bitcast from vector");
+ 
+       // Collect known bits for the (larger) output by collecting the known
+       // bits from each set of sub elements and shift these into place.
+       // We need to separately call computeKnownBits for each set of
+       // sub elements as the knownbits for each is likely to be different.
+       unsigned SubScale = BitWidth / SubBitWidth;
+       APInt SubDemandedElts(NumElts * SubScale, 0);
+       for (unsigned i = 0; i != NumElts; ++i)
+         if (DemandedElts[i])
+           SubDemandedElts.setBit(i * SubScale);
+ 
+       for (unsigned i = 0; i != SubScale; ++i) {
+         Known2 = computeKnownBits(N0, SubDemandedElts.shl(i),
+                          Depth + 1);
+         unsigned Shifts = IsLE ? i : SubScale - 1 - i;
+         Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * Shifts);
+         Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * Shifts);
+       }
+     }
+ 
+     // Bitcast 'large element' scalar/vector to 'small element' vector.
+     if ((SubBitWidth % BitWidth) == 0) {
+       assert(Op.getValueType().isVector() && "Expected bitcast to vector");
+ 
+       // Collect known bits for the (smaller) output by collecting the known
+       // bits from the overlapping larger input elements and extracting the
+       // sub sections we actually care about.
+       unsigned SubScale = SubBitWidth / BitWidth;
+       APInt SubDemandedElts(NumElts / SubScale, 0);
+       for (unsigned i = 0; i != NumElts; ++i)
+         if (DemandedElts[i])
+           SubDemandedElts.setBit(i / SubScale);
+ 
+       Known2 = computeKnownBits(N0, SubDemandedElts, Depth + 1);
+ 
+       Known.Zero.setAllBits(); Known.One.setAllBits();
+       for (unsigned i = 0; i != NumElts; ++i)
+         if (DemandedElts[i]) {
+           unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+           unsigned Offset = (Shifts % SubScale) * BitWidth;
+           Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
+           Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
+           // If we don't know any bits, early out.
+           if (Known.isUnknown())
+             break;
+         }
+     }
+     break;
+   }
+   case ISD::AND:
+     // If either the LHS or the RHS are Zero, the result is zero.
+     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+     // Output known-1 bits are only known if set in both the LHS & RHS.
+     Known.One &= Known2.One;
+     // Output known-0 are known to be clear if zero in either the LHS | RHS.
+     Known.Zero |= Known2.Zero;
+     break;
+   case ISD::OR:
+     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+     // Output known-0 bits are only known if clear in both the LHS & RHS.
+     Known.Zero &= Known2.Zero;
+     // Output known-1 are known to be set if set in either the LHS | RHS.
+     Known.One |= Known2.One;
+     break;
+   case ISD::XOR: {
+     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+     // Output known-0 bits are known if clear or set in both the LHS & RHS.
+     APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
+     // Output known-1 are known to be set if set in only one of the LHS, RHS.
+     Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
+     Known.Zero = KnownZeroOut;
+     break;
+   }
+   case ISD::MUL: {
+     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+     // If low bits are zero in either operand, output low known-0 bits.
+     // Also compute a conservative estimate for high known-0 bits.
+     // More trickiness is possible, but this is sufficient for the
+     // interesting case of alignment computation.
+     unsigned TrailZ = Known.countMinTrailingZeros() +
+                       Known2.countMinTrailingZeros();
+     unsigned LeadZ =  std::max(Known.countMinLeadingZeros() +
+                                Known2.countMinLeadingZeros(),
+                                BitWidth) - BitWidth;
+ 
+     Known.resetAll();
+     Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
+     Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
+     break;
+   }
+   case ISD::UDIV: {
+     // For the purposes of computing leading zeros we can conservatively
+     // treat a udiv as a logical right shift by the power of 2 known to
+     // be less than the denominator.
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     unsigned LeadZ = Known2.countMinLeadingZeros();
+ 
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+     if (RHSMaxLeadingZeros != BitWidth)
+       LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
+ 
+     Known.Zero.setHighBits(LeadZ);
+     break;
+   }
+   case ISD::SELECT:
+   case ISD::VSELECT:
+     Known = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
+     // If we don't know any bits, early out.
+     if (Known.isUnknown())
+       break;
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth+1);
+ 
+     // Only known if known in both the LHS and RHS.
+     Known.One &= Known2.One;
+     Known.Zero &= Known2.Zero;
+     break;
+   case ISD::SELECT_CC:
+     Known = computeKnownBits(Op.getOperand(3), DemandedElts, Depth+1);
+     // If we don't know any bits, early out.
+     if (Known.isUnknown())
+       break;
+     Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
+ 
+     // Only known if known in both the LHS and RHS.
+     Known.One &= Known2.One;
+     Known.Zero &= Known2.Zero;
+     break;
+   case ISD::SMULO:
+   case ISD::UMULO:
+   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+     if (Op.getResNo() != 1)
+       break;
+     // The boolean result conforms to getBooleanContents.
+     // If we know the result of a setcc has the top bits zero, use this info.
+     // We know that we have an integer-based boolean since these operations
+     // are only available for integer.
+     if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+             TargetLowering::ZeroOrOneBooleanContent &&
+         BitWidth > 1)
+       Known.Zero.setBitsFrom(1);
+     break;
+   case ISD::SETCC:
+     // If we know the result of a setcc has the top bits zero, use this info.
+     if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+             TargetLowering::ZeroOrOneBooleanContent &&
+         BitWidth > 1)
+       Known.Zero.setBitsFrom(1);
+     break;
+   case ISD::SHL:
+     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
+       Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+       unsigned Shift = ShAmt->getZExtValue();
+       Known.Zero <<= Shift;
+       Known.One <<= Shift;
+       // Low bits are known zero.
+       Known.Zero.setLowBits(Shift);
+     }
+     break;
+   case ISD::SRL:
+     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
+       Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+       unsigned Shift = ShAmt->getZExtValue();
+       Known.Zero.lshrInPlace(Shift);
+       Known.One.lshrInPlace(Shift);
+       // High bits are known zero.
+       Known.Zero.setHighBits(Shift);
+     } else if (auto *BV = dyn_cast<BuildVectorSDNode>(Op.getOperand(1))) {
+       // If the shift amount is a vector of constants see if we can bound
+       // the number of upper zero bits.
+       unsigned ShiftAmountMin = BitWidth;
+       for (unsigned i = 0; i != BV->getNumOperands(); ++i) {
+         if (auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
+           const APInt &ShAmt = C->getAPIntValue();
+           if (ShAmt.ult(BitWidth)) {
+             ShiftAmountMin = std::min<unsigned>(ShiftAmountMin,
+                                                 ShAmt.getZExtValue());
+             continue;
+           }
+         }
+         // Don't know anything.
+         ShiftAmountMin = 0;
+         break;
+       }
+ 
+       Known.Zero.setHighBits(ShiftAmountMin);
+     }
+     break;
+   case ISD::SRA:
+     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
+       Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+       unsigned Shift = ShAmt->getZExtValue();
+       // Sign extend known zero/one bit (else is unknown).
+       Known.Zero.ashrInPlace(Shift);
+       Known.One.ashrInPlace(Shift);
+     }
+     break;
+   case ISD::SIGN_EXTEND_INREG: {
+     EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+     unsigned EBits = EVT.getScalarSizeInBits();
+ 
+     // Sign extension.  Compute the demanded bits in the result that are not
+     // present in the input.
+     APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
+ 
+     APInt InSignMask = APInt::getSignMask(EBits);
+     APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
+ 
+     // If the sign extended bits are demanded, we know that the sign
+     // bit is demanded.
+     InSignMask = InSignMask.zext(BitWidth);
+     if (NewBits.getBoolValue())
+       InputDemandedBits |= InSignMask;
+ 
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known.One &= InputDemandedBits;
+     Known.Zero &= InputDemandedBits;
+ 
+     // If the sign bit of the input is known set or clear, then we know the
+     // top bits of the result.
+     if (Known.Zero.intersects(InSignMask)) {        // Input sign bit known clear
+       Known.Zero |= NewBits;
+       Known.One  &= ~NewBits;
+     } else if (Known.One.intersects(InSignMask)) {  // Input sign bit known set
+       Known.One  |= NewBits;
+       Known.Zero &= ~NewBits;
+     } else {                              // Input sign bit unknown
+       Known.Zero &= ~NewBits;
+       Known.One  &= ~NewBits;
+     }
+     break;
+   }
+   case ISD::CTTZ:
+   case ISD::CTTZ_ZERO_UNDEF: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     // If we have a known 1, its position is our upper bound.
+     unsigned PossibleTZ = Known2.countMaxTrailingZeros();
+     unsigned LowBits = Log2_32(PossibleTZ) + 1;
+     Known.Zero.setBitsFrom(LowBits);
+     break;
+   }
+   case ISD::CTLZ:
+   case ISD::CTLZ_ZERO_UNDEF: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     // If we have a known 1, its position is our upper bound.
+     unsigned PossibleLZ = Known2.countMaxLeadingZeros();
+     unsigned LowBits = Log2_32(PossibleLZ) + 1;
+     Known.Zero.setBitsFrom(LowBits);
+     break;
+   }
+   case ISD::CTPOP: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     // If we know some of the bits are zero, they can't be one.
+     unsigned PossibleOnes = Known2.countMaxPopulation();
+     Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
+     break;
+   }
+   case ISD::LOAD: {
+     LoadSDNode *LD = cast<LoadSDNode>(Op);
+     // If this is a ZEXTLoad and we are looking at the loaded value.
+     if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+       EVT VT = LD->getMemoryVT();
+       unsigned MemBits = VT.getScalarSizeInBits();
+       Known.Zero.setBitsFrom(MemBits);
+     } else if (const MDNode *Ranges = LD->getRanges()) {
+       if (LD->getExtensionType() == ISD::NON_EXTLOAD)
+         computeKnownBitsFromRangeMetadata(*Ranges, Known);
+     }
+     break;
+   }
+   case ISD::ZERO_EXTEND_VECTOR_INREG: {
+     EVT InVT = Op.getOperand(0).getValueType();
+     APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
+     Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
+     Known = Known.zext(BitWidth);
+     Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
+     break;
+   }
+   case ISD::ZERO_EXTEND: {
+     EVT InVT = Op.getOperand(0).getValueType();
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known = Known.zext(BitWidth);
+     Known.Zero.setBitsFrom(InVT.getScalarSizeInBits());
+     break;
+   }
+   // TODO ISD::SIGN_EXTEND_VECTOR_INREG
+   case ISD::SIGN_EXTEND: {
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     // If the sign bit is known to be zero or one, then sext will extend
+     // it to the top bits, else it will just zext.
+     Known = Known.sext(BitWidth);
+     break;
+   }
+   case ISD::ANY_EXTEND: {
+     Known = computeKnownBits(Op.getOperand(0), Depth+1);
+     Known = Known.zext(BitWidth);
+     break;
+   }
+   case ISD::TRUNCATE: {
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known = Known.trunc(BitWidth);
+     break;
+   }
+   case ISD::AssertZext: {
+     EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+     APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+     Known = computeKnownBits(Op.getOperand(0), Depth+1);
+     Known.Zero |= (~InMask);
+     Known.One  &= (~Known.Zero);
+     break;
+   }
+   case ISD::FGETSIGN:
+     // All bits are zero except the low bit.
+     Known.Zero.setBitsFrom(1);
+     break;
+   case ISD::USUBO:
+   case ISD::SSUBO:
+     if (Op.getResNo() == 1) {
+       // If we know the result of a setcc has the top bits zero, use this info.
+       if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+               TargetLowering::ZeroOrOneBooleanContent &&
+           BitWidth > 1)
+         Known.Zero.setBitsFrom(1);
+       break;
+     }
+     LLVM_FALLTHROUGH;
+   case ISD::SUB:
+   case ISD::SUBC: {
+     if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
+       // We know that the top bits of C-X are clear if X contains less bits
+       // than C (i.e. no wrap-around can happen).  For example, 20-X is
+       // positive if we can prove that X is >= 0 and < 16.
+       if (CLHS->getAPIntValue().isNonNegative()) {
+         unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+         // NLZ can't be BitWidth with no sign bit
+         APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+         Known2 = computeKnownBits(Op.getOperand(1), DemandedElts,
+                          Depth + 1);
+ 
+         // If all of the MaskV bits are known to be zero, then we know the
+         // output top bits are zero, because we now know that the output is
+         // from [0-C].
+         if ((Known2.Zero & MaskV) == MaskV) {
+           unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+           // Top bits known zero.
+           Known.Zero.setHighBits(NLZ2);
+         }
+       }
+     }
+ 
+     // If low bits are known to be zero in both operands, then we know they are
+     // going to be 0 in the result. Both addition and complement operations
+     // preserve the low zero bits.
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     unsigned KnownZeroLow = Known2.countMinTrailingZeros();
+     if (KnownZeroLow == 0)
+       break;
+ 
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+     Known.Zero.setLowBits(KnownZeroLow);
+     break;
+   }
+   case ISD::UADDO:
+   case ISD::SADDO:
+   case ISD::ADDCARRY:
+     if (Op.getResNo() == 1) {
+       // If we know the result of a setcc has the top bits zero, use this info.
+       if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+               TargetLowering::ZeroOrOneBooleanContent &&
+           BitWidth > 1)
+         Known.Zero.setBitsFrom(1);
+       break;
+     }
+     LLVM_FALLTHROUGH;
+   case ISD::ADD:
+   case ISD::ADDC:
+   case ISD::ADDE: {
+     // Output known-0 bits are known if clear or set in both the low clear bits
+     // common to both LHS & RHS.  For example, 8+(X<<3) is known to have the
+     // low 3 bits clear.
+     // Output known-0 bits are also known if the top bits of each input are
+     // known to be clear. For example, if one input has the top 10 bits clear
+     // and the other has the top 8 bits clear, we know the top 7 bits of the
+     // output must be clear.
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+     unsigned KnownZeroLow = Known2.countMinTrailingZeros();
+ 
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+     KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+ 
+     if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
+       // With ADDE and ADDCARRY, a carry bit may be added in, so we can only
+       // use this information if we know (at least) that the low two bits are
+       // clear. We then return to the caller that the low bit is unknown but
+       // that other bits are known zero.
+       if (KnownZeroLow >= 2)
+         Known.Zero.setBits(1, KnownZeroLow);
+       break;
+     }
+ 
+     Known.Zero.setLowBits(KnownZeroLow);
+     if (KnownZeroHigh > 1)
+       Known.Zero.setHighBits(KnownZeroHigh - 1);
+     break;
+   }
+   case ISD::SREM:
+     if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
+       const APInt &RA = Rem->getAPIntValue().abs();
+       if (RA.isPowerOf2()) {
+         APInt LowBits = RA - 1;
+         Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+         // The low bits of the first operand are unchanged by the srem.
+         Known.Zero = Known2.Zero & LowBits;
+         Known.One = Known2.One & LowBits;
+ 
+         // If the first operand is non-negative or has all low bits zero, then
+         // the upper bits are all zero.
+         if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits))
+           Known.Zero |= ~LowBits;
+ 
+         // If the first operand is negative and not all low bits are zero, then
+         // the upper bits are all one.
+         if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
+           Known.One |= ~LowBits;
+         assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?");
+       }
+     }
+     break;
+   case ISD::UREM: {
+     if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
+       const APInt &RA = Rem->getAPIntValue();
+       if (RA.isPowerOf2()) {
+         APInt LowBits = (RA - 1);
+         Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+         // The upper bits are all zero, the lower ones are unchanged.
+         Known.Zero = Known2.Zero | ~LowBits;
+         Known.One = Known2.One & LowBits;
+         break;
+       }
+     }
+ 
+     // Since the result is less than or equal to either operand, any leading
+     // zero bits in either operand must also exist in the result.
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ 
+     uint32_t Leaders =
+         std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
+     Known.resetAll();
+     Known.Zero.setHighBits(Leaders);
+     break;
+   }
+   case ISD::EXTRACT_ELEMENT: {
+     Known = computeKnownBits(Op.getOperand(0), Depth+1);
+     const unsigned Index = Op.getConstantOperandVal(1);
+     const unsigned BitWidth = Op.getValueSizeInBits();
+ 
+     // Remove low part of known bits mask
+     Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth);
+     Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth);
+ 
+     // Remove high part of known bit mask
+     Known = Known.trunc(BitWidth);
+     break;
+   }
+   case ISD::EXTRACT_VECTOR_ELT: {
+     SDValue InVec = Op.getOperand(0);
+     SDValue EltNo = Op.getOperand(1);
+     EVT VecVT = InVec.getValueType();
+     const unsigned BitWidth = Op.getValueSizeInBits();
+     const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
+     const unsigned NumSrcElts = VecVT.getVectorNumElements();
+     // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
+     // anything about the extended bits.
+     if (BitWidth > EltBitWidth)
+       Known = Known.trunc(EltBitWidth);
+     ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+     if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
+       // If we know the element index, just demand that vector element.
+       unsigned Idx = ConstEltNo->getZExtValue();
+       APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
+       Known = computeKnownBits(InVec, DemandedElt, Depth + 1);
+     } else {
+       // Unknown element index, so ignore DemandedElts and demand them all.
+       Known = computeKnownBits(InVec, Depth + 1);
+     }
+     if (BitWidth > EltBitWidth)
+       Known = Known.zext(BitWidth);
+     break;
+   }
+   case ISD::INSERT_VECTOR_ELT: {
+     SDValue InVec = Op.getOperand(0);
+     SDValue InVal = Op.getOperand(1);
+     SDValue EltNo = Op.getOperand(2);
+ 
+     ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+     if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+       // If we know the element index, split the demand between the
+       // source vector and the inserted element.
+       Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
+       unsigned EltIdx = CEltNo->getZExtValue();
+ 
+       // If we demand the inserted element then add its common known bits.
+       if (DemandedElts[EltIdx]) {
+         Known2 = computeKnownBits(InVal, Depth + 1);
+         Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
+         Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
+       }
+ 
+       // If we demand the source vector then add its common known bits, ensuring
+       // that we don't demand the inserted element.
+       APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
+       if (!!VectorElts) {
+         Known2 = computeKnownBits(InVec, VectorElts, Depth + 1);
+         Known.One &= Known2.One;
+         Known.Zero &= Known2.Zero;
+       }
+     } else {
+       // Unknown element index, so ignore DemandedElts and demand them all.
+       Known = computeKnownBits(InVec, Depth + 1);
+       Known2 = computeKnownBits(InVal, Depth + 1);
+       Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
+       Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
+     }
+     break;
+   }
+   case ISD::BITREVERSE: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known.Zero = Known2.Zero.reverseBits();
+     Known.One = Known2.One.reverseBits();
+     break;
+   }
+   case ISD::BSWAP: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known.Zero = Known2.Zero.byteSwap();
+     Known.One = Known2.One.byteSwap();
+     break;
+   }
+   case ISD::ABS: {
+     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ 
+     // If the source's MSB is zero then we know the rest of the bits already.
+     if (Known2.isNonNegative()) {
+       Known.Zero = Known2.Zero;
+       Known.One = Known2.One;
+       break;
+     }
+ 
+     // We only know that the absolute value's MSB will be zero iff there is
+     // a set bit that isn't the sign bit (otherwise it could be INT_MIN).
+     Known2.One.clearSignBit();
+     if (Known2.One.getBoolValue()) {
+       Known.Zero = APInt::getSignMask(BitWidth);
+       break;
+     }
+     break;
+   }
+   case ISD::UMIN: {
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ 
+     // UMIN - we know that the result will have the maximum of the
+     // known zero leading bits of the inputs.
+     unsigned LeadZero = Known.countMinLeadingZeros();
+     LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
+ 
+     Known.Zero &= Known2.Zero;
+     Known.One &= Known2.One;
+     Known.Zero.setHighBits(LeadZero);
+     break;
+   }
+   case ISD::UMAX: {
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ 
+     // UMAX - we know that the result will have the maximum of the
+     // known one leading bits of the inputs.
+     unsigned LeadOne = Known.countMinLeadingOnes();
+     LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
+ 
+     Known.Zero &= Known2.Zero;
+     Known.One &= Known2.One;
+     Known.One.setHighBits(LeadOne);
+     break;
+   }
+   case ISD::SMIN:
+   case ISD::SMAX: {
+     // If we have a clamp pattern, we know that the number of sign bits will be
+     // the minimum of the clamp min/max range.
+     bool IsMax = (Opcode == ISD::SMAX);
+     ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+     if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)))
+       if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+         CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1),
+                                               DemandedElts);
+     if (CstLow && CstHigh) {
+       if (!IsMax)
+         std::swap(CstLow, CstHigh);
+ 
+       const APInt &ValueLow = CstLow->getAPIntValue();
+       const APInt &ValueHigh = CstHigh->getAPIntValue();
+       if (ValueLow.sle(ValueHigh)) {
+         unsigned LowSignBits = ValueLow.getNumSignBits();
+         unsigned HighSignBits = ValueHigh.getNumSignBits();
+         unsigned MinSignBits = std::min(LowSignBits, HighSignBits);
+         if (ValueLow.isNegative() && ValueHigh.isNegative()) {
+           Known.One.setHighBits(MinSignBits);
+           break;
+         }
+         if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) {
+           Known.Zero.setHighBits(MinSignBits);
+           break;
+         }
+       }
+     }
+ 
+     // Fallback - just get the shared known bits of the operands.
+     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+     if (Known.isUnknown()) break; // Early-out
+     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+     Known.Zero &= Known2.Zero;
+     Known.One &= Known2.One;
+     break;
+   }
+   case ISD::FrameIndex:
+   case ISD::TargetFrameIndex:
+     TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
+     break;
+ 
+   default:
+     if (Opcode < ISD::BUILTIN_OP_END)
+       break;
+     LLVM_FALLTHROUGH;
+   case ISD::INTRINSIC_WO_CHAIN:
+   case ISD::INTRINSIC_W_CHAIN:
+   case ISD::INTRINSIC_VOID:
+     // Allow the target to implement this method for its nodes.
+     TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
+     break;
+   }
+ 
+   assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+   return Known;
+ }
+ 
+ /// Conservatively classify whether N0 + N1 can overflow, using the unsigned
+ /// overflow check (APInt::uadd_ov) on the operands' known bits.
+ ///
+ /// \returns OFK_Never when one of the checks below rules out overflow, and
+ /// OFK_Sometime when none of them applies.
+ SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
+                                                              SDValue N1) const {
+   // Adding zero can never overflow.
+   if (isNullConstant(N1))
+     return OFK_Never;
+ 
+   // True when no bit other than bit 0 may be set, i.e. the value is 0 or 1.
+   auto IsAtMostOne = [](const KnownBits &Bits) {
+     const APInt MaybeSet = ~Bits.Zero;
+     return (MaybeSet & 0x01) == MaybeSet;
+   };
+ 
+   KnownBits RHSKnown;
+   computeKnownBits(N1, RHSKnown);
+   if (RHSKnown.Zero.getBoolValue()) {
+     // At least one bit of N1 is known zero, so looking at N0 may pay off.
+     KnownBits LHSKnown;
+     computeKnownBits(N0, LHSKnown);
+ 
+     // ~Zero is the largest value each operand could hold. If even those two
+     // maxima add up without overflowing, the real operands cannot overflow.
+     bool MaxSumOverflows;
+     (void)(~LHSKnown.Zero).uadd_ov(~RHSKnown.Zero, MaxSumOverflows);
+     if (!MaxSumOverflows)
+       return OFK_Never;
+   }
+ 
+   // mulhi + 1 never overflows: N0 is result 1 of UMUL_LOHI and N1 is known
+   // to be 0 or 1.
+   if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
+       IsAtMostOne(RHSKnown))
+     return OFK_Never;
+ 
+   // Same pattern with the operands swapped; N0's known bits are computed
+   // here because the block above may not have run.
+   if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) {
+     KnownBits LHSKnown;
+     computeKnownBits(N0, LHSKnown);
+ 
+     if (IsAtMostOne(LHSKnown))
+       return OFK_Never;
+   }
+ 
+   return OFK_Sometime;
+ }
+ 
+ /// Return true when \p Val can be shown to have exactly one bit set.
+ ///
+ /// A constant operand is answered directly. Otherwise a few shift and
+ /// BUILD_VECTOR patterns are recognised, and anything else is decided from
+ /// the known bits of \p Val.
+ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
+   const unsigned ScalarBits = Val.getValueType().getScalarSizeInBits();
+ 
+   // A plain constant gives a definite answer either way.
+   if (auto *CN = dyn_cast<ConstantSDNode>(Val))
+     return CN->getAPIntValue().zextOrTrunc(ScalarBits).isPowerOf2();
+ 
+   switch (Val.getOpcode()) {
+   case ISD::SHL: {
+     // (shl 1, X) has exactly one bit set, because shifting the bit off the
+     // end is undefined.
+     ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
+     if (Shifted && Shifted->getAPIntValue() == 1)
+       return true;
+     break;
+   }
+   case ISD::SRL: {
+     // Likewise, (srl SignMask, X) has exactly one bit set.
+     ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
+     if (Shifted && Shifted->getAPIntValue().isSignMask())
+       return true;
+     break;
+   }
+   case ISD::BUILD_VECTOR: {
+     // Every operand must be a constant that is a power of two once adjusted
+     // to the element width.
+     auto IsPow2Constant = [ScalarBits](SDValue Elt) {
+       auto *EltC = dyn_cast<ConstantSDNode>(Elt);
+       return EltC &&
+              EltC->getAPIntValue().zextOrTrunc(ScalarBits).isPowerOf2();
+     };
+     if (llvm::all_of(Val->ops(), IsPow2Constant))
+       return true;
+     break;
+   }
+   default:
+     break;
+   }
+ 
+   // More patterns could be matched above; these cover some common cases.
+ 
+   // Otherwise rely on known bits: the population count must be pinned to
+   // exactly one from both sides.
+   const KnownBits Bits = computeKnownBits(Val);
+   return Bits.countMaxPopulation() == 1 && Bits.countMinPopulation() == 1;
+ }
+ 
+ /// Convenience overload that demands every element of \p Op and forwards to
+ /// the DemandedElts overload.
+ ///
+ /// A non-vector value is treated as having a single demanded element.
+ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
+   const EVT OpVT = Op.getValueType();
+ 
+   // Non-vector: a one-bit mask with its only bit set.
+   if (!OpVT.isVector())
+     return ComputeNumSignBits(Op, APInt(1, 1), Depth);
+ 
+   // Vector: one mask bit per element, all set.
+   const APInt AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());
+   return ComputeNumSignBits(Op, AllElts, Depth);
+ }
+ 
+ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                                           unsigned Depth) const {
+   EVT VT = Op.getValueType();
+   assert((VT.isInteger() || VT.isFloatingPoint()) && "Invalid VT!");
+   unsigned VTBits = VT.getScalarSizeInBits();
+   unsigned NumElts = DemandedElts.getBitWidth();
+   unsigned Tmp, Tmp2;
+   unsigned FirstAnswer = 1;
+ 
+   if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+     const APInt &Val = C->getAPIntValue();
+     return Val.getNumSignBits();
+   }
+ 
+   if (Depth == 6)
+     return 1;  // Limit search depth.
+ 
+   if (!DemandedElts)
+     return 1;  // No demanded elts, better to assume we don't know anything.
+ 
+   unsigned Opcode = Op.getOpcode();
+   switch (Opcode) {
+   default: break;
+   case ISD::AssertSext:
+     Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+     return VTBits-Tmp+1;
+   case ISD::AssertZext:
+     Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+     return VTBits-Tmp;
+ 
+   case ISD::BUILD_VECTOR:
+     Tmp = VTBits;
+     for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+       if (!DemandedElts[i])
+         continue;
+ 
+       SDValue SrcOp = Op.getOperand(i);
+       Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+ 
+       // BUILD_VECTOR can implicitly truncate sources, we must handle this.
+       if (SrcOp.getValueSizeInBits() != VTBits) {
+         assert(SrcOp.getValueSizeInBits() > VTBits &&
+                "Expected BUILD_VECTOR implicit truncation");
+         unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
+         Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
+       }
+       Tmp = std::min(Tmp, Tmp2);
+     }
+     return Tmp;
+ 
+   case ISD::VECTOR_SHUFFLE: {
+     // Collect the minimum number of sign bits that are shared by every vector
+     // element referenced by the shuffle.
+     APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+     const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+     assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+     for (unsigned i = 0; i != NumElts; ++i) {
+       int M = SVN->getMaskElt(i);
+       if (!DemandedElts[i])
+         continue;
+       // For UNDEF elements, we don't know anything about the common state of
+       // the shuffle result.
+       if (M < 0)
+         return 1;
+       if ((unsigned)M < NumElts)
+         DemandedLHS.setBit((unsigned)M % NumElts);
+       else
+         DemandedRHS.setBit((unsigned)M % NumElts);
+     }
+     Tmp = std::numeric_limits<unsigned>::max();
+     if (!!DemandedLHS)
+       Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+     if (!!DemandedRHS) {
+       Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+       Tmp = std::min(Tmp, Tmp2);
+     }
+     // If we don't know anything, early out and try computeKnownBits fall-back.
+     if (Tmp == 1)
+       break;
+     assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+     return Tmp;
+   }
+ 
+   case ISD::BITCAST: {
+     SDValue N0 = Op.getOperand(0);
+     EVT SrcVT = N0.getValueType();
+     unsigned SrcBits = SrcVT.getScalarSizeInBits();
+ 
+     // Ignore bitcasts from unsupported types..
+     if (!(SrcVT.isInteger() || SrcVT.isFloatingPoint()))
+       break;
+ 
+     // Fast handling of 'identity' bitcasts.
+     if (VTBits == SrcBits)
+       return ComputeNumSignBits(N0, DemandedElts, Depth + 1);
+ 
+     bool IsLE = getDataLayout().isLittleEndian();
+ 
+     // Bitcast 'large element' scalar/vector to 'small element' vector.
+     if ((SrcBits % VTBits) == 0) {
+       assert(VT.isVector() && "Expected bitcast to vector");
+ 
+       unsigned Scale = SrcBits / VTBits;
+       APInt SrcDemandedElts(NumElts / Scale, 0);
+       for (unsigned i = 0; i != NumElts; ++i)
+         if (DemandedElts[i])
+           SrcDemandedElts.setBit(i / Scale);
+ 
+       // Fast case - sign splat can be simply split across the small elements.
+       Tmp = ComputeNumSignBits(N0, SrcDemandedElts, Depth + 1);
+       if (Tmp == SrcBits)
+         return VTBits;
+ 
+       // Slow case - determine how far the sign extends into each sub-element.
+       Tmp2 = VTBits;
+       for (unsigned i = 0; i != NumElts; ++i)
+         if (DemandedElts[i]) {
+           unsigned SubOffset = i % Scale;
+           SubOffset = (IsLE ? ((Scale - 1) - SubOffset) : SubOffset);
+           SubOffset = SubOffset * VTBits;
+           if (Tmp <= SubOffset)
+             return 1;
+           Tmp2 = std::min(Tmp2, Tmp - SubOffset);
+         }
+       return Tmp2;
+     }
+     break;
+   }
+ 
+   case ISD::SIGN_EXTEND:
+     Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
+     return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
+   case ISD::SIGN_EXTEND_INREG:
+     // Max of the input and what this extends.
+     Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
+     Tmp = VTBits-Tmp+1;
+     Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+     return std::max(Tmp, Tmp2);
+   case ISD::SIGN_EXTEND_VECTOR_INREG: {
+     SDValue Src = Op.getOperand(0);
+     EVT SrcVT = Src.getValueType();
+     APInt DemandedSrcElts = DemandedElts.zextOrSelf(SrcVT.getVectorNumElements());
+     Tmp = VTBits - SrcVT.getScalarSizeInBits();
+     return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp;
+   }
+ 
+   case ISD::SRA:
+     Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+     // SRA X, C   -> adds C sign bits.
+     if (ConstantSDNode *C =
+             isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) {
+       APInt ShiftVal = C->getAPIntValue();
+       ShiftVal += Tmp;
+       Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+     }
+     return Tmp;
+   case ISD::SHL:
+     if (ConstantSDNode *C =
+             isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)) {
+       // shl destroys sign bits.
+       Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+       if (C->getAPIntValue().uge(VTBits) ||      // Bad shift.
+           C->getAPIntValue().uge(Tmp)) break;    // Shifted all sign bits out.
+       return Tmp - C->getZExtValue();
+     }
+     break;
+   case ISD::AND:
+   case ISD::OR:
+   case ISD::XOR:    // NOT is handled here.
+     // Logical binary ops preserve the number of sign bits at the worst.
+     Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+     if (Tmp != 1) {
+       Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
+       FirstAnswer = std::min(Tmp, Tmp2);
+       // We computed what we know about the sign bits as our first
+       // answer. Now proceed to the generic code that uses
+       // computeKnownBits, and pick whichever answer is better.
+     }
+     break;
+ 
+   case ISD::SELECT:
+   case ISD::VSELECT:
+     Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
+     if (Tmp == 1) return 1;  // Early out.
+     Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
+     return std::min(Tmp, Tmp2);
+   case ISD::SELECT_CC:
+     Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
+     if (Tmp == 1) return 1;  // Early out.
+     Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1);
+     return std::min(Tmp, Tmp2);
+ 
+   case ISD::SMIN:
+   case ISD::SMAX: {
+     // If we have a clamp pattern, we know that the number of sign bits will be
+     // the minimum of the clamp min/max range.
+     bool IsMax = (Opcode == ISD::SMAX);
+     ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+     if ((CstLow = isConstOrDemandedConstSplat(Op.getOperand(1), DemandedElts)))
+       if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+         CstHigh = isConstOrDemandedConstSplat(Op.getOperand(0).getOperand(1),
+                                               DemandedElts);
+     if (CstLow && CstHigh) {
+       if (!IsMax)
+         std::swap(CstLow, CstHigh);
+       if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) {
+         Tmp = CstLow->getAPIntValue().getNumSignBits();
+         Tmp2 = CstHigh->getAPIntValue().getNumSignBits();
+         return std::min(Tmp, Tmp2);
+       }
+     }
+ 
+     // Fallback - just get the minimum number of sign bits of the operands.
+     Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+     if (Tmp == 1)
+       return 1;  // Early out.
+     Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+     return std::min(Tmp, Tmp2);
+   }
+   case ISD::UMIN:
+   case ISD::UMAX:
+     Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+     if (Tmp == 1)
+       return 1;  // Early out.
+     Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+     return std::min(Tmp, Tmp2);
+   case ISD::SADDO:
+   case ISD::UADDO:
+   case ISD::SSUBO:
+   case ISD::USUBO:
+   case ISD::SMULO:
+   case ISD::UMULO:
+     if (Op.getResNo() != 1)
+       break;
+     // The boolean result conforms to getBooleanContents.  Fall through.
+     // If setcc returns 0/-1, all bits are sign bits.
+     // We know that we have an integer-based boolean since these operations
+     // are only available for integer.
+     if (TLI->getBooleanContents(VT.isVector(), false) ==
+         TargetLowering::ZeroOrNegativeOneBooleanContent)
+       return VTBits;
+     break;
+   case ISD::SETCC:
+     // If setcc returns 0/-1, all bits are sign bits.
+     if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+         TargetLowering::ZeroOrNegativeOneBooleanContent)
+       return VTBits;
+     break;
+   case ISD::ROTL:
+   case ISD::ROTR:
+     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+       unsigned RotAmt = C->getAPIntValue().urem(VTBits);
+ 
+       // Handle rotate right by N like a rotate left by VTBits-N.
+       if (Opcode == ISD::ROTR)
+         RotAmt = (VTBits - RotAmt) % VTBits;
+ 
+       // If we aren't rotating out all of the known-in sign bits, return the
+       // number that are left.  This handles rotl(sext(x), 1) for example.
+       Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+       if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
+     }
+     break;
+   case ISD::ADD:
+   case ISD::ADDC:
+     // Add can have at most one carry bit.  Thus we know that the output
+     // is, at worst, one more bit than the inputs.
+     Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+     if (Tmp == 1) return 1;  // Early out.
+ 
+     // Special case decrementing a value (ADD X, -1):
+     if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+       if (CRHS->isAllOnesValue()) {
+         KnownBits Known = computeKnownBits(Op.getOperand(0), Depth+1);
+ 
+         // If the input is known to be 0 or 1, the output is 0/-1, which is all
+         // sign bits set.
+         if ((Known.Zero | 1).isAllOnesValue())
+           return VTBits;
+ 
+         // If we are subtracting one from a positive number, there is no carry
+         // out of the result.
+         if (Known.isNonNegative())
+           return Tmp;
+       }
+ 
+     Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+     if (Tmp2 == 1) return 1;
+     return std::min(Tmp, Tmp2)-1;
+ 
+   case ISD::SUB:
+     Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+     if (Tmp2 == 1) return 1;
+ 
+     // Handle NEG.
+     if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
+       if (CLHS->isNullValue()) {
+         KnownBits Known = computeKnownBits(Op.getOperand(1), Depth+1);
+         // If the input is known to be 0 or 1, the output is 0/-1, which is all
+         // sign bits set.
+         if ((Known.Zero | 1).isAllOnesValue())
+           return VTBits;
+ 
+         // If the input is known to be positive (the sign bit is known clear),
+         // the output of the NEG has the same number of sign bits as the input.
+         if (Known.isNonNegative())
+           return Tmp2;
+ 
+         // Otherwise, we treat this like a SUB.
+       }
+ 
+     // Sub can have at most one carry bit.  Thus we know that the output
+     // is, at worst, one more bit than the inputs.
+     Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+     if (Tmp == 1) return 1;  // Early out.
+     return std::min(Tmp, Tmp2)-1;
+   case ISD::TRUNCATE: {
+     // Check if the sign bits of source go down as far as the truncated value.
+     unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
+     unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+     if (NumSrcSignBits > (NumSrcBits - VTBits))
+       return NumSrcSignBits - (NumSrcBits - VTBits);
+     break;
+   }
+   case ISD::EXTRACT_ELEMENT: {
+     const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+     const int BitWidth = Op.getValueSizeInBits();
+     const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;
+ 
+     // Get reverse index (starting from 0), Op1 value indexes elements from
+     // little end. Sign starts at big end.
+     const int rIndex = Items - 1 - Op.getConstantOperandVal(1);
+ 
+     // If the sign portion ends in our element the subtraction gives correct
+     // result. Otherwise it gives either negative or > bitwidth result
+     return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
+   }
+   case ISD::INSERT_VECTOR_ELT: {
+     SDValue InVec = Op.getOperand(0);
+     SDValue InVal = Op.getOperand(1);
+     SDValue EltNo = Op.getOperand(2);
+     unsigned NumElts = InVec.getValueType().getVectorNumElements();
+ 
+     ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+     if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+       // If we know the element index, split the demand between the
+       // source vector and the inserted element.
+       unsigned EltIdx = CEltNo->getZExtValue();
+ 
+       // If we demand the inserted element then get its sign bits.
+       Tmp = std::numeric_limits<unsigned>::max();
+       if (DemandedElts[EltIdx]) {
+         // TODO - handle implicit truncation of inserted elements.
+         if (InVal.getScalarValueSizeInBits() != VTBits)
+           break;
+         Tmp = ComputeNumSignBits(InVal, Depth + 1);
+       }
+ 
+       // If we demand the source vector then get its sign bits, and determine
+       // the minimum.
+       APInt VectorElts = DemandedElts;
+       VectorElts.clearBit(EltIdx);
+       if (!!VectorElts) {
+         Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
+         Tmp = std::min(Tmp, Tmp2);
+       }
+     } else {
+       // Unknown element index, so ignore DemandedElts and demand them all.
+       Tmp = ComputeNumSignBits(InVec, Depth + 1);
+       Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
+       Tmp = std::min(Tmp, Tmp2);
+     }
+     assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+     return Tmp;
+   }
+   case ISD::EXTRACT_VECTOR_ELT: {
+     SDValue InVec = Op.getOperand(0);
+     SDValue EltNo = Op.getOperand(1);
+     EVT VecVT = InVec.getValueType();
+     const unsigned BitWidth = Op.getValueSizeInBits();
+     const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+     const unsigned NumSrcElts = VecVT.getVectorNumElements();
+ 
+     // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
+     // anything about sign bits. But if the sizes match we can derive knowledge
+     // about sign bits from the vector operand.
+     if (BitWidth != EltBitWidth)
+       break;
+ 
+     // If we know the element index, just demand that vector element, else for
+     // an unknown element index, ignore DemandedElts and demand them all.
+     APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+     ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+     if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+       DemandedSrcElts =
+           APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+ 
+     return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
+   }
+   case ISD::EXTRACT_SUBVECTOR: {
+     // If we know the element index, just demand that subvector elements,
+     // otherwise demand them all.
+     SDValue Src = Op.getOperand(0);
+     ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+     if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+       // Offset the demanded elts by the subvector index.
+       uint64_t Idx = SubIdx->getZExtValue();
+       APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+       return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+     }
+     return ComputeNumSignBits(Src, Depth + 1);
+   }
+   case ISD::CONCAT_VECTORS:
+     // Determine the minimum number of sign bits across all demanded
+     // elts of the input vectors. Early out if the result is already 1.
+     Tmp = std::numeric_limits<unsigned>::max();
+     EVT SubVectorVT = Op.getOperand(0).getValueType();
+     unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+     unsigned NumSubVectors = Op.getNumOperands();
+     for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+       APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+       DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+       if (!DemandedSub)
+         continue;
+       Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+       Tmp = std::min(Tmp, Tmp2);
+     }
+     assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+     return Tmp;
+   }
+ 
+   // If we are looking at the loaded value of the SDNode.
+   if (Op.getResNo() == 0) {
+     // Handle LOADX separately here. EXTLOAD case will fallthrough.
+     if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+       unsigned ExtType = LD->getExtensionType();
+       switch (ExtType) {
+         default: break;
+         case ISD::SEXTLOAD:    // '17' bits known
+           Tmp = LD->getMemoryVT().getScalarSizeInBits();
+           return VTBits-Tmp+1;
+         case ISD::ZEXTLOAD:    // '16' bits known
+           Tmp = LD->getMemoryVT().getScalarSizeInBits();
+           return VTBits-Tmp;
+       }
+     }
+   }
+ 
+   // Allow the target to implement this method for its nodes.
+   if (Opcode >= ISD::BUILTIN_OP_END ||
+       Opcode == ISD::INTRINSIC_WO_CHAIN ||
+       Opcode == ISD::INTRINSIC_W_CHAIN ||
+       Opcode == ISD::INTRINSIC_VOID) {
+     unsigned NumBits =
+         TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
+     if (NumBits > 1)
+       FirstAnswer = std::max(FirstAnswer, NumBits);
+   }
+ 
+   // Finally, if we can prove that the top bits of the result are 0's or 1's,
+   // use this information.
+   KnownBits Known = computeKnownBits(Op, DemandedElts, Depth);
+ 
+   APInt Mask;
+   if (Known.isNonNegative()) {        // sign bit is 0
+     Mask = Known.Zero;
+   } else if (Known.isNegative()) {  // sign bit is 1;
+     Mask = Known.One;
+   } else {
+     // Nothing known.
+     return FirstAnswer;
+   }
+ 
+   // Okay, we know that the sign bit in Mask is set.  Use CLZ to determine
+   // the number of identical bits in the top of the input value.
+   Mask = ~Mask;
+   Mask <<= Mask.getBitWidth()-VTBits;
+   // Return # leading zeros.  We use 'min' here in case Val was zero before
+   // shifting.  We don't want to return '64' as for an i32 "0".
+   return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+ }
+ 
+ bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
+   if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
+       !isa<ConstantSDNode>(Op.getOperand(1)))
+     return false; // Only ADD/OR with a constant operand 1 can qualify.
+ 
+   if (Op.getOpcode() == ISD::OR &&
+       !MaskedValueIsZero(Op.getOperand(0),
+                      cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue()))
+     return false; // OR: constant's bits not proven clear in operand 0.
+ 
+   return true; // ADD, or an OR that passed the MaskedValueIsZero check.
+ }
+ 
+ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const {
+   // Is Op never a NaN (or, when SNaN is set, never a signaling NaN)?
+   if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
+     return true; // If we're told that NaNs won't happen, assume they won't.
+ 
+   if (Depth >= 6)
+     return false; // Limit search depth (>= also stops calls entered past it).
+ 
+   // TODO: Handle vectors.
+   // If the value is a constant, we can obviously see if it is a NaN or not.
+   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+     return !C->getValueAPF().isNaN() ||
+            (SNaN && !C->getValueAPF().isSignaling());
+   }
+ 
+   unsigned Opcode = Op.getOpcode();
+   switch (Opcode) {
+   case ISD::FADD:
+   case ISD::FSUB:
+   case ISD::FMUL:
+   case ISD::FDIV:
+   case ISD::FREM:
+   case ISD::FSIN:
+   case ISD::FCOS: {
+     if (SNaN)
+       return true;
+     // TODO: Need isKnownNeverInfinity
+     return false;
+   }
+   case ISD::FCANONICALIZE:
+   case ISD::FEXP:
+   case ISD::FEXP2:
+   case ISD::FTRUNC:
+   case ISD::FFLOOR:
+   case ISD::FCEIL:
+   case ISD::FROUND:
+   case ISD::FRINT:
+   case ISD::FNEARBYINT: {
+     if (SNaN)
+       return true;
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+   }
+   case ISD::FABS:
+   case ISD::FNEG:
+   case ISD::FCOPYSIGN: {
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+   }
+   case ISD::SELECT:
+     return isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+            isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+   case ISD::FP_EXTEND:
+   case ISD::FP_ROUND: {
+     if (SNaN)
+       return true;
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+   }
+   case ISD::SINT_TO_FP:
+   case ISD::UINT_TO_FP:
+     return true;
+   case ISD::FMA:
+   case ISD::FMAD: {
+     if (SNaN)
+       return true;
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+            isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+            isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+   }
+   case ISD::FSQRT: // Need is known positive
+   case ISD::FLOG:
+   case ISD::FLOG2:
+   case ISD::FLOG10:
+   case ISD::FPOWI:
+   case ISD::FPOW: {
+     if (SNaN)
+       return true;
+     // TODO: Refine on operand
+     return false;
+   }
+   case ISD::FMINNUM:
+   case ISD::FMAXNUM: {
+     // Only one needs to be known not-nan, since it will be returned if the
+     // other ends up being one.
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+            isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+   }
+   case ISD::FMINNUM_IEEE:
+   case ISD::FMAXNUM_IEEE: {
+     if (SNaN)
+       return true;
+     // This can return a NaN if either operand is an sNaN, or if both operands
+     // are NaN.
+     return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+             isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+            (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+             isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+   }
+   case ISD::FMINIMUM:
+   case ISD::FMAXIMUM: {
+     // TODO: Does this quiet or return the original NaN as-is?
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+            isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+   }
+   case ISD::EXTRACT_VECTOR_ELT: {
+     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+   }
+   default:
+     if (Opcode >= ISD::BUILTIN_OP_END ||
+         Opcode == ISD::INTRINSIC_WO_CHAIN ||
+         Opcode == ISD::INTRINSIC_W_CHAIN ||
+         Opcode == ISD::INTRINSIC_VOID) {
+       return TLI->isKnownNeverNaNForTargetNode(Op, *this, SNaN, Depth);
+     }
+ 
+     return false; // Unhandled opcode: nothing is known.
+   }
+ }
+ 
+ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
+   assert(Op.getValueType().isFloatingPoint() &&
+          "Floating point type expected");
+ 
+   // If the value is a constant, we can obviously see if it is a zero or not.
+   // TODO: Add BuildVector support.
+   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+     return !C->isZero();
+   return false; // Not a ConstantFPSDNode: nothing is known.
+ }
+ 
+ bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+   assert(!Op.getValueType().isFloatingPoint() &&
+          "Floating point types unsupported - use isKnownNeverZeroFloat");
+ 
+   // If the value is a constant, we can obviously see if it is a zero or not.
+   if (ISD::matchUnaryPredicate(
+           Op, [](ConstantSDNode *C) { return !C->isNullValue(); }))
+     return true;
+ 
+   // TODO: Recognize more cases here.
+   switch (Op.getOpcode()) {
+   default: break;
+   case ISD::OR:
+     if (isKnownNeverZero(Op.getOperand(1)) ||
+         isKnownNeverZero(Op.getOperand(0)))
+       return true; // An OR is non-zero as soon as one operand is non-zero.
+     break;
+   }
+ 
+   return false; // Could not prove Op non-zero.
+ }
+ 
+ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
+   // Check the obvious case.
+   if (A == B) return true;
+ 
+   // Check for negative and positive zero: two FP zero constants are equal.
+   if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
+     if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
+       if (CA->isZero() && CB->isZero()) return true;
+ 
+   // Otherwise they may not be equal.
+   return false;
+ }
+ 
+ // FIXME: unify with llvm::haveNoCommonBitsSet.
+ // FIXME: could also handle masked merge pattern (X & ~M) op (Y & M)
+ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
+   assert(A.getValueType() == B.getValueType() &&
+          "Values must have the same type"); // Next: each bit known 0 in A or B?
+   return (computeKnownBits(A).Zero | computeKnownBits(B).Zero).isAllOnesValue();
+ }
+ 
+ static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
+                                 ArrayRef<SDValue> Ops,
+                                 SelectionDAG &DAG) {
+   int NumOps = Ops.size();
+   assert(NumOps != 0 && "Can't build an empty vector!");
+   assert(VT.getVectorNumElements() == (unsigned)NumOps &&
+          "Incorrect element count in BUILD_VECTOR!");
+ 
+   // BUILD_VECTOR of UNDEFs is UNDEF.
+   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+     return DAG.getUNDEF(VT);
+ 
+   // BUILD_VECTOR of elts 0..N-1 extracted in order from one VT vector is Identity.
+   SDValue IdentitySrc;
+   bool IsIdentity = true;
+   for (int i = 0; i != NumOps; ++i) {
+     if (Ops[i].getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+         Ops[i].getOperand(0).getValueType() != VT ||
+         (IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) ||
+         !isa<ConstantSDNode>(Ops[i].getOperand(1)) ||
+         cast<ConstantSDNode>(Ops[i].getOperand(1))->getAPIntValue() != i) {
+       IsIdentity = false;
+       break;
+     }
+     IdentitySrc = Ops[i].getOperand(0); // Source all later operands must share.
+   }
+   if (IsIdentity)
+     return IdentitySrc;
+ 
+   return SDValue(); // No fold applied; return an empty SDValue.
+ }
+ 
+ static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
+                                   ArrayRef<SDValue> Ops,
+                                   SelectionDAG &DAG) {
+   assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
+   assert(llvm::all_of(Ops,
+                       [Ops](SDValue Op) {
+                         return Ops[0].getValueType() == Op.getValueType();
+                       }) &&
+          "Concatenation of vectors with inconsistent value types!");
+   assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
+              VT.getVectorNumElements() &&
+          "Incorrect element count in vector concatenation!");
+ 
+   if (Ops.size() == 1)
+     return Ops[0]; // A single-operand concat is just that operand.
+ 
+   // Concat of UNDEFs is UNDEF.
+   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+     return DAG.getUNDEF(VT);
+ 
+   // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
+   // simplified to one big BUILD_VECTOR.
+   // FIXME: Add support for SCALAR_TO_VECTOR as well.
+   EVT SVT = VT.getScalarType();
+   SmallVector<SDValue, 16> Elts;
+   for (SDValue Op : Ops) {
+     EVT OpVT = Op.getValueType();
+     if (Op.isUndef())
+       Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
+     else if (Op.getOpcode() == ISD::BUILD_VECTOR)
+       Elts.append(Op->op_begin(), Op->op_end());
+     else
+       return SDValue(); // Operand is neither UNDEF nor BUILD_VECTOR: no fold.
+   }
+ 
+   // BUILD_VECTOR requires all inputs to be of the same type, find the
+   // maximum type and extend them all.
+   for (SDValue Op : Elts)
+     SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+ 
+   if (SVT.bitsGT(VT.getScalarType()))
+     for (SDValue &Op : Elts)
+       Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
+                ? DAG.getZExtOrTrunc(Op, DL, SVT)
+                : DAG.getSExtOrTrunc(Op, DL, SVT); // Zero-extend only if free.
+ 
+   SDValue V = DAG.getBuildVector(VT, DL, Elts);
+   NewSDValueDbgMsg(V, "New node fold concat vectors: ", &DAG);
+   return V;
+ }
+ 
+ /// Gets or creates the node for \p Opcode with result type \p VT and no operands.
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, Opcode, getVTList(VT), None);
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+     return SDValue(E, 0); // A matching node was found; return it.
+ 
+   auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                               getVTList(VT));
+   CSEMap.InsertNode(N, IP); // Record the new node at the position found above.
+ 
+   InsertNode(N);
+   SDValue V = SDValue(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               SDValue Operand, const SDNodeFlags Flags) {
+   // Constant fold unary operations with an integer constant operand. Even
+   // opaque constant will be folded, because the folding of unary operations
+   // doesn't create new constants with different values. Nevertheless, the
+   // opaque flag is preserved during folding to prevent future folding with
+   // other constants.
+   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
+     const APInt &Val = C->getAPIntValue();
+     switch (Opcode) {
+     default: break;
+     case ISD::SIGN_EXTEND:
+       return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
+                          C->isTargetOpcode(), C->isOpaque());
+     case ISD::TRUNCATE:
+       if (C->isOpaque())
+         break;
+       LLVM_FALLTHROUGH;
+     case ISD::ANY_EXTEND:
+     case ISD::ZERO_EXTEND:
+       return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
+                          C->isTargetOpcode(), C->isOpaque());
+     case ISD::UINT_TO_FP:
+     case ISD::SINT_TO_FP: {
+       APFloat apf(EVTToAPFloatSemantics(VT),
+                   APInt::getNullValue(VT.getSizeInBits()));
+       (void)apf.convertFromAPInt(Val,
+                                  Opcode==ISD::SINT_TO_FP,
+                                  APFloat::rmNearestTiesToEven);
+       return getConstantFP(apf, DL, VT);
+     }
+     case ISD::BITCAST:
+       if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
+         return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
+       if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+         return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
+       if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+         return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
+       if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
+         return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
+       break;
+     case ISD::ABS:
+       return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::BITREVERSE:
+       return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::BSWAP:
+       return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::CTPOP:
+       return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::CTLZ:
+     case ISD::CTLZ_ZERO_UNDEF:
+       return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::CTTZ:
+     case ISD::CTTZ_ZERO_UNDEF:
+       return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
+                          C->isOpaque());
+     case ISD::FP16_TO_FP: {
+       bool Ignored;
+       APFloat FPV(APFloat::IEEEhalf(),
+                   (Val.getBitWidth() == 16) ? Val : Val.trunc(16));
+ 
+       // This can return overflow, underflow, or inexact; we don't care.
+       // FIXME need to be more flexible about rounding mode.
+       (void)FPV.convert(EVTToAPFloatSemantics(VT),
+                         APFloat::rmNearestTiesToEven, &Ignored);
+       return getConstantFP(FPV, DL, VT);
+     }
+     }
+   }
+ 
+   // Constant fold unary operations with a floating point constant operand.
+   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
+     APFloat V = C->getValueAPF();    // make copy
+     switch (Opcode) {
+     case ISD::FNEG:
+       V.changeSign();
+       return getConstantFP(V, DL, VT);
+     case ISD::FABS:
+       V.clearSign();
+       return getConstantFP(V, DL, VT);
+     case ISD::FCEIL: {
+       APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+       if (fs == APFloat::opOK || fs == APFloat::opInexact)
+         return getConstantFP(V, DL, VT);
+       break;
+     }
+     case ISD::FTRUNC: {
+       APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+       if (fs == APFloat::opOK || fs == APFloat::opInexact)
+         return getConstantFP(V, DL, VT);
+       break;
+     }
+     case ISD::FFLOOR: {
+       APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+       if (fs == APFloat::opOK || fs == APFloat::opInexact)
+         return getConstantFP(V, DL, VT);
+       break;
+     }
+     case ISD::FP_EXTEND: {
+       bool ignored;
+       // This can return overflow, underflow, or inexact; we don't care.
+       // FIXME need to be more flexible about rounding mode.
+       (void)V.convert(EVTToAPFloatSemantics(VT),
+                       APFloat::rmNearestTiesToEven, &ignored);
+       return getConstantFP(V, DL, VT);
+     }
+     case ISD::FP_TO_SINT:
+     case ISD::FP_TO_UINT: {
+       bool ignored;
+       APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
+       // FIXME need to be more flexible about rounding mode.
+       APFloat::opStatus s =
+           V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
+       if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
+         break;
+       return getConstant(IntVal, DL, VT);
+     }
+     case ISD::BITCAST:
+       if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
+         return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+       else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+         return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+       else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+         return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
+       break;
+     case ISD::FP_TO_FP16: {
+       bool Ignored;
+       // This can return overflow, underflow, or inexact; we don't care.
+       // FIXME need to be more flexible about rounding mode.
+       (void)V.convert(APFloat::IEEEhalf(),
+                       APFloat::rmNearestTiesToEven, &Ignored);
+       return getConstant(V.bitcastToAPInt(), DL, VT);
+     }
+     }
+   }
+ 
+   // Constant fold unary operations with a vector integer or float operand.
+   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
+     if (BV->isConstant()) {
+       switch (Opcode) {
+       default:
+         // FIXME: Entirely reasonable to perform folding of other unary
+         // operations here as the need arises.
+         break;
+       case ISD::FNEG:
+       case ISD::FABS:
+       case ISD::FCEIL:
+       case ISD::FTRUNC:
+       case ISD::FFLOOR:
+       case ISD::FP_EXTEND:
+       case ISD::FP_TO_SINT:
+       case ISD::FP_TO_UINT:
+       case ISD::TRUNCATE:
+       case ISD::ANY_EXTEND:
+       case ISD::ZERO_EXTEND:
+       case ISD::SIGN_EXTEND:
+       case ISD::UINT_TO_FP:
+       case ISD::SINT_TO_FP:
+       case ISD::ABS:
+       case ISD::BITREVERSE:
+       case ISD::BSWAP:
+       case ISD::CTLZ:
+       case ISD::CTLZ_ZERO_UNDEF:
+       case ISD::CTTZ:
+       case ISD::CTTZ_ZERO_UNDEF:
+       case ISD::CTPOP: {
+         SDValue Ops = { Operand };
+         if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
+           return Fold;
+       }
+       }
+     }
+   }
+ 
+   unsigned OpOpcode = Operand.getNode()->getOpcode();
+   switch (Opcode) {
+   case ISD::TokenFactor:
+   case ISD::MERGE_VALUES:
+   case ISD::CONCAT_VECTORS:
+     return Operand;         // Factor, merge or concat of one node?  No need.
+   case ISD::BUILD_VECTOR: {
+     // Attempt to simplify BUILD_VECTOR.
+     SDValue Ops[] = {Operand};
+     if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+       return V;
+     break;
+   }
+   case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
+   case ISD::FP_EXTEND:
+     assert(VT.isFloatingPoint() &&
+            Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
+     if (Operand.getValueType() == VT) return Operand;  // noop conversion.
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() ==
+             Operand.getValueType().getVectorNumElements()) &&
+            "Vector element count mismatch!");
+     assert(Operand.getValueType().bitsLT(VT) &&
+            "Invalid fpext node, dst < src!");
+     if (Operand.isUndef())
+       return getUNDEF(VT);
+     break;
+   case ISD::SIGN_EXTEND:
+     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+            "Invalid SIGN_EXTEND!");
+     if (Operand.getValueType() == VT) return Operand;   // noop extension
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() ==
+             Operand.getValueType().getVectorNumElements()) &&
+            "Vector element count mismatch!");
+     assert(Operand.getValueType().bitsLT(VT) &&
+            "Invalid sext node, dst < src!");
+     if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+       return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+     else if (OpOpcode == ISD::UNDEF)
+       // sext(undef) = 0, because the top bits will all be the same.
+       return getConstant(0, DL, VT);
+     break;
+   case ISD::ZERO_EXTEND:
+     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+            "Invalid ZERO_EXTEND!");
+     if (Operand.getValueType() == VT) return Operand;   // noop extension
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() ==
+             Operand.getValueType().getVectorNumElements()) &&
+            "Vector element count mismatch!");
+     assert(Operand.getValueType().bitsLT(VT) &&
+            "Invalid zext node, dst < src!");
+     if (OpOpcode == ISD::ZERO_EXTEND)   // (zext (zext x)) -> (zext x)
+       return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
+     else if (OpOpcode == ISD::UNDEF)
+       // zext(undef) = 0, because the top bits will be zero.
+       return getConstant(0, DL, VT);
+     break;
+   case ISD::ANY_EXTEND:
+     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+            "Invalid ANY_EXTEND!");
+     if (Operand.getValueType() == VT) return Operand;   // noop extension
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() ==
+             Operand.getValueType().getVectorNumElements()) &&
+            "Vector element count mismatch!");
+     assert(Operand.getValueType().bitsLT(VT) &&
+            "Invalid anyext node, dst < src!");
+ 
+     if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+         OpOpcode == ISD::ANY_EXTEND)
+       // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
+       return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+     else if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+ 
+     // (ext (trunc x)) -> x
+     if (OpOpcode == ISD::TRUNCATE) {
+       SDValue OpOp = Operand.getOperand(0);
+       if (OpOp.getValueType() == VT) {
+         transferDbgValues(Operand, OpOp);
+         return OpOp;
+       }
+     }
+     break;
+   case ISD::TRUNCATE:
+     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+            "Invalid TRUNCATE!");
+     if (Operand.getValueType() == VT) return Operand;   // noop truncate
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() ==
+             Operand.getValueType().getVectorNumElements()) &&
+            "Vector element count mismatch!");
+     assert(Operand.getValueType().bitsGT(VT) &&
+            "Invalid truncate node, src < dst!");
+     if (OpOpcode == ISD::TRUNCATE)
+       return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+     if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+         OpOpcode == ISD::ANY_EXTEND) {
+       // If the source is smaller than the dest, we still need an extend.
+       if (Operand.getOperand(0).getValueType().getScalarType()
+             .bitsLT(VT.getScalarType()))
+         return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+       if (Operand.getOperand(0).getValueType().bitsGT(VT))
+         return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+       return Operand.getOperand(0);
+     }
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     break;
+   case ISD::ANY_EXTEND_VECTOR_INREG:
+   case ISD::ZERO_EXTEND_VECTOR_INREG:
+   case ISD::SIGN_EXTEND_VECTOR_INREG:
+     assert(VT.isVector() && "This DAG node is restricted to vector types.");
+     assert(Operand.getValueType().bitsLE(VT) &&
+            "The input must be the same size or smaller than the result.");
+     assert(VT.getVectorNumElements() <
+              Operand.getValueType().getVectorNumElements() &&
+            "The destination vector type must have fewer lanes than the input.");
+     break;
+   case ISD::ABS:
+     assert(VT.isInteger() && VT == Operand.getValueType() &&
+            "Invalid ABS!");
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     break;
+   case ISD::BSWAP:
+     assert(VT.isInteger() && VT == Operand.getValueType() &&
+            "Invalid BSWAP!");
+     assert((VT.getScalarSizeInBits() % 16 == 0) &&
+            "BSWAP types must be a multiple of 16 bits!");
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     break;
+   case ISD::BITREVERSE:
+     assert(VT.isInteger() && VT == Operand.getValueType() &&
+            "Invalid BITREVERSE!");
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     break;
+   case ISD::BITCAST:
+     // Basic sanity checking.
+     assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
+            "Cannot BITCAST between types of different sizes!");
+     if (VT == Operand.getValueType()) return Operand;  // noop conversion.
+     if (OpOpcode == ISD::BITCAST)  // bitconv(bitconv(x)) -> bitconv(x)
+       return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     break;
+   case ISD::SCALAR_TO_VECTOR:
+     assert(VT.isVector() && !Operand.getValueType().isVector() &&
+            (VT.getVectorElementType() == Operand.getValueType() ||
+             (VT.getVectorElementType().isInteger() &&
+              Operand.getValueType().isInteger() &&
+              VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
+            "Illegal SCALAR_TO_VECTOR node!");
+     if (OpOpcode == ISD::UNDEF)
+       return getUNDEF(VT);
+     // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+     if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+         isa<ConstantSDNode>(Operand.getOperand(1)) &&
+         Operand.getConstantOperandVal(1) == 0 &&
+         Operand.getOperand(0).getValueType() == VT)
+       return Operand.getOperand(0);
+     break;
+   case ISD::FNEG:
+     // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+     if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) &&
+         OpOpcode == ISD::FSUB)
+       return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
+                      Operand.getOperand(0), Flags);
+     if (OpOpcode == ISD::FNEG)  // --X -> X
+       return Operand.getOperand(0);
+     break;
+   case ISD::FABS:
+     if (OpOpcode == ISD::FNEG)  // abs(-X) -> abs(X)
+       return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
+     break;
+   }
+ 
+   SDNode *N;
+   SDVTList VTs = getVTList(VT);
+   SDValue Ops[] = {Operand};
+   if (VT != MVT::Glue) { // Don't CSE flag producing nodes
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTs, Ops);
+     void *IP = nullptr;
+     if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+       E->intersectFlagsWith(Flags);
+       return SDValue(E, 0);
+     }
+ 
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     N->setFlags(Flags);
+     createOperands(N, Ops);
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     createOperands(N, Ops);
+   }
+ 
+   InsertNode(N);
+   SDValue V = SDValue(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ /// Folds integer binop Opcode over C1 and C2; .second is false if not folded.
+ static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
+                                         const APInt &C2) {
+   switch (Opcode) {
+   case ISD::ADD:  return std::make_pair(C1 + C2, true);
+   case ISD::SUB:  return std::make_pair(C1 - C2, true);
+   case ISD::MUL:  return std::make_pair(C1 * C2, true);
+   case ISD::AND:  return std::make_pair(C1 & C2, true);
+   case ISD::OR:   return std::make_pair(C1 | C2, true);
+   case ISD::XOR:  return std::make_pair(C1 ^ C2, true);
+   case ISD::SHL:  return std::make_pair(C1 << C2, true);
+   case ISD::SRL:  return std::make_pair(C1.lshr(C2), true);
+   case ISD::SRA:  return std::make_pair(C1.ashr(C2), true);
+   case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
+   case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
+   case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
+   case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
+   case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
+   case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
+   case ISD::UDIV:
+     if (!C2.getBoolValue()) // C2 is zero: do not fold the division.
+       break;
+     return std::make_pair(C1.udiv(C2), true);
+   case ISD::UREM:
+     if (!C2.getBoolValue()) // C2 is zero: do not fold the remainder.
+       break;
+     return std::make_pair(C1.urem(C2), true);
+   case ISD::SDIV:
+     if (!C2.getBoolValue()) // C2 is zero: do not fold the division.
+       break;
+     return std::make_pair(C1.sdiv(C2), true);
+   case ISD::SREM:
+     if (!C2.getBoolValue()) // C2 is zero: do not fold the remainder.
+       break;
+     return std::make_pair(C1.srem(C2), true);
+   }
+   return std::make_pair(APInt(1, 0), false); // Placeholder value; not folded.
+ }
+ /// Folds Opcode over two scalar constants; returns a null SDValue on failure.
+ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+                                              EVT VT, const ConstantSDNode *Cst1,
+                                              const ConstantSDNode *Cst2) {
+   if (Cst1->isOpaque() || Cst2->isOpaque()) // Opaque constants are not folded.
+     return SDValue();
+ 
+   std::pair<APInt, bool> Folded = FoldValue(Opcode, Cst1->getAPIntValue(),
+                                             Cst2->getAPIntValue());
+   if (!Folded.second) // FoldValue did not fold this opcode/operand pair.
+     return SDValue();
+   return getConstant(Folded.first, DL, VT);
+ }
+ /// Folds add/sub of constant N2 into the offset of global address node GA.
+ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
+                                        const GlobalAddressSDNode *GA,
+                                        const SDNode *N2) {
+   if (GA->getOpcode() != ISD::GlobalAddress) // Only this opcode is handled.
+     return SDValue();
+   if (!TLI->isOffsetFoldingLegal(GA)) // Bail out when TLI disallows folding.
+     return SDValue();
+   const ConstantSDNode *Cst2 = dyn_cast<ConstantSDNode>(N2);
+   if (!Cst2) // N2 is not a ConstantSDNode: nothing to fold.
+     return SDValue();
+   int64_t Offset = Cst2->getSExtValue();
+   switch (Opcode) {
+   case ISD::ADD: break;
+   case ISD::SUB: Offset = -uint64_t(Offset); break; // Negated as unsigned.
+   default: return SDValue(); // Any other opcode is not folded.
+   }
+   return getGlobalAddress(GA->getGlobal(), SDLoc(Cst2), VT,
+                           GA->getOffset() + uint64_t(Offset));
+ }
+ /// Returns true when Opcode applied to Ops is known here to yield undef.
+ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+   switch (Opcode) {
+   case ISD::SDIV:
+   case ISD::UDIV:
+   case ISD::SREM:
+   case ISD::UREM: {
+     // If a divisor is zero/undef or any element of a divisor vector is
+     // zero/undef, the whole op is undef.
+     assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+     SDValue Divisor = Ops[1];
+     if (Divisor.isUndef() || isNullConstant(Divisor)) // Divisor as a whole.
+       return true;
+ 
+     return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+            llvm::any_of(Divisor->op_values(),
+                         [](SDValue V) { return V.isUndef() ||
+                                         isNullConstant(V); });
+     // TODO: Handle signed overflow.
+   }
+   // TODO: Handle oversized shifts.
+   default:
+     return false; // Every other opcode is reported as not undef.
+   }
+ }
+ /// Folds Opcode over two nodes (scalar constants, symbols or build vectors).
+ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+                                              EVT VT, SDNode *Cst1,
+                                              SDNode *Cst2) {
+   // If the opcode is a target-specific ISD node, there's nothing we can
+   // do here and the operand rules may not line up with the below, so
+   // bail early.
+   if (Opcode >= ISD::BUILTIN_OP_END)
+     return SDValue();
+ 
+   if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
+     return getUNDEF(VT);
+ 
+   // Handle the case of two scalars.
+   if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
+     if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
+       SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
+       assert((!Folded || !VT.isVector()) &&
+              "Can't fold vectors ops with scalar operands");
+       return Folded; // Returned as-is, even when null (not folded).
+     }
+   }
+ 
+   // fold (add Sym, c) -> Sym+c
+   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1))
+     return FoldSymbolOffset(Opcode, VT, GA, Cst2); // As-is, even when null.
+   if (TLI->isCommutativeBinOp(Opcode))
+     if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
+       return FoldSymbolOffset(Opcode, VT, GA, Cst1); // As-is, even when null.
+ 
+   // For vectors, extract each constant element and fold them individually.
+   // Either input may be an undef value.
+   auto *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
+   if (!BV1 && !Cst1->isUndef())
+     return SDValue();
+   auto *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
+   if (!BV2 && !Cst2->isUndef())
+     return SDValue();
+   // If both operands are undef, that's handled the same way as scalars.
+   if (!BV1 && !BV2)
+     return SDValue();
+ 
+   assert((!BV1 || !BV2 || BV1->getNumOperands() == BV2->getNumOperands()) &&
+          "Vector binop with different number of elements in operands?");
+ 
+   EVT SVT = VT.getScalarType();
+   EVT LegalSVT = SVT;
+   if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
+     LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
+     if (LegalSVT.bitsLT(SVT)) // Transformed type is narrower: give up.
+       return SDValue();
+   }
+   SmallVector<SDValue, 4> Outputs;
+   unsigned NumOps = BV1 ? BV1->getNumOperands() : BV2->getNumOperands();
+   for (unsigned I = 0; I != NumOps; ++I) {
+     SDValue V1 = BV1 ? BV1->getOperand(I) : getUNDEF(SVT);
+     SDValue V2 = BV2 ? BV2->getOperand(I) : getUNDEF(SVT);
+     if (SVT.isInteger()) { // Truncate elements that are wider than SVT.
+       if (V1->getValueType(0).bitsGT(SVT))
+         V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
+       if (V2->getValueType(0).bitsGT(SVT))
+         V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
+     }
+ 
+     if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
+       return SDValue(); // Element types still differ from SVT: give up.
+ 
+     // Fold one vector element.
+     SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2);
+     if (LegalSVT != SVT)
+       ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
+ 
+     // Scalar folding only succeeded if the result is a constant or UNDEF.
+     if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
+         ScalarResult.getOpcode() != ISD::ConstantFP)
+       return SDValue();
+     Outputs.push_back(ScalarResult);
+   }
+ 
+   assert(VT.getVectorNumElements() == Outputs.size() &&
+          "Vector size mismatch!");
+ 
+   // We may have a vector type but a scalar result. Create a splat.
+   Outputs.resize(VT.getVectorNumElements(), Outputs.back()); // No-op if assert holds.
+ 
+   // Build a big vector out of the scalar elements we generated.
+   return getBuildVector(VT, SDLoc(), Outputs); // NOTE(review): SDLoc(), not DL - confirm.
+ }
+ /// Folds Opcode lane by lane over Ops; returns a null SDValue when it cannot.
+ // TODO: Merge with FoldConstantArithmetic
+ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
+                                                    const SDLoc &DL, EVT VT,
+                                                    ArrayRef<SDValue> Ops,
+                                                    const SDNodeFlags Flags) {
+   // If the opcode is a target-specific ISD node, there's nothing we can
+   // do here and the operand rules may not line up with the below, so
+   // bail early.
+   if (Opcode >= ISD::BUILTIN_OP_END)
+     return SDValue();
+ 
+   if (isUndef(Opcode, Ops))
+     return getUNDEF(VT);
+ 
+   // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
+   if (!VT.isVector())
+     return SDValue();
+ 
+   unsigned NumElts = VT.getVectorNumElements();
+ 
+   auto IsScalarOrSameVectorSize = [&](const SDValue &Op) {
+     return !Op.getValueType().isVector() ||
+            Op.getValueType().getVectorNumElements() == NumElts;
+   };
+ 
+   auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) {
+     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
+     return (Op.isUndef()) || (Op.getOpcode() == ISD::CONDCODE) ||
+            (BV && BV->isConstant());
+   };
+ 
+   // Every operand must be UNDEF, a CONDCODE, or a constant build vector, and
+   // any vector operand must have the same number of elements as the result
+   // type (non-vector operands pass the size check as-is).
+   if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) ||
+       !llvm::all_of(Ops, IsScalarOrSameVectorSize))
+     return SDValue();
+ 
+   // If we are comparing vectors, then the result needs to be a i1 boolean
+   // that is then sign-extended back to the legal result type.
+   EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType());
+ 
+   // Find legal integer scalar type for constant promotion and
+   // ensure that its scalar size is at least as large as source.
+   EVT LegalSVT = VT.getScalarType();
+   if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
+     LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
+     if (LegalSVT.bitsLT(VT.getScalarType()))
+       return SDValue();
+   }
+ 
+   // Constant fold each scalar lane separately.
+   SmallVector<SDValue, 4> ScalarResults;
+   for (unsigned i = 0; i != NumElts; i++) {
+     SmallVector<SDValue, 4> ScalarOps;
+     for (SDValue Op : Ops) {
+       EVT InSVT = Op.getValueType().getScalarType();
+       BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op);
+       if (!InBV) {
+         // Not a build vector, so the check above admitted UNDEF or CONDCODE.
+         if (Op.isUndef())
+           ScalarOps.push_back(getUNDEF(InSVT));
+         else
+           ScalarOps.push_back(Op); // Passed to every lane unchanged.
+         continue;
+       }
+ 
+       SDValue ScalarOp = InBV->getOperand(i);
+       EVT ScalarVT = ScalarOp.getValueType();
+ 
+       // Build vector (integer) scalar operands may need implicit
+       // truncation - do this before constant folding.
+       if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT))
+         ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp);
+ 
+       ScalarOps.push_back(ScalarOp);
+     }
+ 
+     // Constant fold the scalar operands.
+     SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
+ 
+     // Legalize the (integer) scalar constant if necessary.
+     if (LegalSVT != SVT)
+       ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
+ 
+     // Scalar folding only succeeded if the result is a constant or UNDEF.
+     if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
+         ScalarResult.getOpcode() != ISD::ConstantFP)
+       return SDValue();
+     ScalarResults.push_back(ScalarResult);
+   }
+ 
+   SDValue V = getBuildVector(VT, DL, ScalarResults);
+   NewSDValueDbgMsg(V, "New node fold constant vector: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               SDValue N1, SDValue N2, const SDNodeFlags Flags) {
+   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+   ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+   ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+ 
+   // Canonicalize constant to RHS if commutative.
+   if (TLI->isCommutativeBinOp(Opcode)) {
+     if (N1C && !N2C) {
+       std::swap(N1C, N2C);
+       std::swap(N1, N2);
+     } else if (N1CFP && !N2CFP) {
+       std::swap(N1CFP, N2CFP);
+       std::swap(N1, N2);
+     }
+   }
+ 
+   switch (Opcode) {
+   default: break;
+   case ISD::TokenFactor:
+     assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+            N2.getValueType() == MVT::Other && "Invalid token factor!");
+     // Fold trivial token factors.
+     if (N1.getOpcode() == ISD::EntryToken) return N2;
+     if (N2.getOpcode() == ISD::EntryToken) return N1;
+     if (N1 == N2) return N1;
+     break;
+   case ISD::BUILD_VECTOR: {
+     // Attempt to simplify BUILD_VECTOR.
+     SDValue Ops[] = {N1, N2};
+     if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+       return V;
+     break;
+   }
+   case ISD::CONCAT_VECTORS: {
+     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+     SDValue Ops[] = {N1, N2};
+     if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+       return V;
+     break;
+   }
+   case ISD::AND:
+     assert(VT.isInteger() && "This operator does not apply to FP types!");
+     assert(N1.getValueType() == N2.getValueType() &&
+            N1.getValueType() == VT && "Binary operator types must match!");
+     // (X & 0) -> 0.  This commonly occurs when legalizing i64 values, so it's
+     // worth handling here.
+     if (N2C && N2C->isNullValue())
+       return N2;
+     if (N2C && N2C->isAllOnesValue())  // X & -1 -> X
+       return N1;
+     break;
+   case ISD::OR:
+   case ISD::XOR:
+   case ISD::ADD:
+   case ISD::SUB:
+     assert(VT.isInteger() && "This operator does not apply to FP types!");
+     assert(N1.getValueType() == N2.getValueType() &&
+            N1.getValueType() == VT && "Binary operator types must match!");
+     // (X ^|+- 0) -> X.  This commonly occurs when legalizing i64 values, so
+     // it's worth handling here.
+     if (N2C && N2C->isNullValue())
+       return N1;
+     break;
+   case ISD::UDIV:
+   case ISD::UREM:
+   case ISD::MULHU:
+   case ISD::MULHS:
+   case ISD::MUL:
+   case ISD::SDIV:
+   case ISD::SREM:
+   case ISD::SMIN:
+   case ISD::SMAX:
+   case ISD::UMIN:
+   case ISD::UMAX:
+     assert(VT.isInteger() && "This operator does not apply to FP types!");
+     assert(N1.getValueType() == N2.getValueType() &&
+            N1.getValueType() == VT && "Binary operator types must match!");
+     break;
+   case ISD::FADD:
+   case ISD::FSUB:
+   case ISD::FMUL:
+   case ISD::FDIV:
+   case ISD::FREM:
+     assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+     assert(N1.getValueType() == N2.getValueType() &&
+            N1.getValueType() == VT && "Binary operator types must match!");
+     break;
+   case ISD::FCOPYSIGN:   // N1 and result must match.  N1/N2 need not match.
+     assert(N1.getValueType() == VT &&
+            N1.getValueType().isFloatingPoint() &&
+            N2.getValueType().isFloatingPoint() &&
+            "Invalid FCOPYSIGN!");
+     break;
+   case ISD::SHL:
+   case ISD::SRA:
+   case ISD::SRL:
+     if (SDValue V = simplifyShift(N1, N2))
+       return V;
+     LLVM_FALLTHROUGH;
+   case ISD::ROTL:
+   case ISD::ROTR:
+     assert(VT == N1.getValueType() &&
+            "Shift operators return type must be the same as their first arg");
+     assert(VT.isInteger() && N2.getValueType().isInteger() &&
+            "Shifts only work on integers");
+     assert((!VT.isVector() || VT == N2.getValueType()) &&
+            "Vector shift amounts must be in the same as their first arg");
+     // Verify that the shift amount VT is big enough to hold valid shift
+     // amounts.  This catches things like trying to shift an i1024 value by an
+     // i8, which is easy to fall into in generic code that uses
+     // TLI.getShiftAmount().
+     assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
+            "Invalid use of small shift amount with oversized value!");
+ 
+     // Always fold shifts of i1 values so the code generator doesn't need to
+     // handle them.  Since we know the size of the shift has to be less than the
+     // size of the value, the shift/rotate count is guaranteed to be zero.
+     if (VT == MVT::i1)
+       return N1;
+     if (N2C && N2C->isNullValue())
+       return N1;
+     break;
+   case ISD::FP_ROUND_INREG: {
+     EVT EVT = cast<VTSDNode>(N2)->getVT();
+     assert(VT == N1.getValueType() && "Not an inreg round!");
+     assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
+            "Cannot FP_ROUND_INREG integer types");
+     assert(EVT.isVector() == VT.isVector() &&
+            "FP_ROUND_INREG type should be vector iff the operand "
+            "type is vector!");
+     assert((!EVT.isVector() ||
+             EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+            "Vector element counts must match in FP_ROUND_INREG");
+     assert(EVT.bitsLE(VT) && "Not rounding down!");
+     (void)EVT;
+     if (cast<VTSDNode>(N2)->getVT() == VT) return N1;  // Not actually rounding.
+     break;
+   }
+   case ISD::FP_ROUND:
+     assert(VT.isFloatingPoint() &&
+            N1.getValueType().isFloatingPoint() &&
+            VT.bitsLE(N1.getValueType()) &&
+            N2C && (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) &&
+            "Invalid FP_ROUND!");
+     if (N1.getValueType() == VT) return N1;  // noop conversion.
+     break;
+   case ISD::AssertSext:
+   case ISD::AssertZext: {
+     EVT EVT = cast<VTSDNode>(N2)->getVT();
+     assert(VT == N1.getValueType() && "Not an inreg extend!");
+     assert(VT.isInteger() && EVT.isInteger() &&
+            "Cannot *_EXTEND_INREG FP types");
+     assert(!EVT.isVector() &&
+            "AssertSExt/AssertZExt type should be the vector element type "
+            "rather than the vector type!");
+     assert(EVT.bitsLE(VT) && "Not extending!");
+     if (VT == EVT) return N1; // noop assertion.
+     break;
+   }
+   case ISD::SIGN_EXTEND_INREG: {
+     EVT EVT = cast<VTSDNode>(N2)->getVT();
+     assert(VT == N1.getValueType() && "Not an inreg extend!");
+     assert(VT.isInteger() && EVT.isInteger() &&
+            "Cannot *_EXTEND_INREG FP types");
+     assert(EVT.isVector() == VT.isVector() &&
+            "SIGN_EXTEND_INREG type should be vector iff the operand "
+            "type is vector!");
+     assert((!EVT.isVector() ||
+             EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+            "Vector element counts must match in SIGN_EXTEND_INREG");
+     assert(EVT.bitsLE(VT) && "Not extending!");
+     if (EVT == VT) return N1;  // Not actually extending
+ 
+     auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
+       unsigned FromBits = EVT.getScalarSizeInBits();
+       Val <<= Val.getBitWidth() - FromBits;
+       Val.ashrInPlace(Val.getBitWidth() - FromBits);
+       return getConstant(Val, DL, ConstantVT);
+     };
+ 
+     if (N1C) {
+       const APInt &Val = N1C->getAPIntValue();
+       return SignExtendInReg(Val, VT);
+     }
+     if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+       SmallVector<SDValue, 8> Ops;
+       llvm::EVT OpVT = N1.getOperand(0).getValueType();
+       for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+         SDValue Op = N1.getOperand(i);
+         if (Op.isUndef()) {
+           Ops.push_back(getUNDEF(OpVT));
+           continue;
+         }
+         ConstantSDNode *C = cast<ConstantSDNode>(Op);
+         APInt Val = C->getAPIntValue();
+         Ops.push_back(SignExtendInReg(Val, OpVT));
+       }
+       return getBuildVector(VT, DL, Ops);
+     }
+     break;
+   }
+   case ISD::EXTRACT_VECTOR_ELT:
+     assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() &&
+            "The result of EXTRACT_VECTOR_ELT must be at least as wide as the \
+              element type of the vector.");
+ 
+     // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
+     if (N1.isUndef())
+       return getUNDEF(VT);
+ 
+     // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
+     if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+       return getUNDEF(VT);
+ 
+     // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+     // expanding copies of large vectors from registers.
+     if (N2C &&
+         N1.getOpcode() == ISD::CONCAT_VECTORS &&
+         N1.getNumOperands() > 0) {
+       unsigned Factor =
+         N1.getOperand(0).getValueType().getVectorNumElements();
+       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                      N1.getOperand(N2C->getZExtValue() / Factor),
+                      getConstant(N2C->getZExtValue() % Factor, DL,
+                                  N2.getValueType()));
+     }
+ 
+     // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
+     // expanding large vector constants.
+     if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
+       SDValue Elt = N1.getOperand(N2C->getZExtValue());
+ 
+       if (VT != Elt.getValueType())
+         // If the vector element type is not legal, the BUILD_VECTOR operands
+         // are promoted and implicitly truncated, and the result implicitly
+         // extended. Make that explicit here.
+         Elt = getAnyExtOrTrunc(Elt, DL, VT);
+ 
+       return Elt;
+     }
+ 
+     // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+     // operations are lowered to scalars.
+     if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+       // If the indices are the same, return the inserted element else
+       // if the indices are known different, extract the element from
+       // the original vector.
+       SDValue N1Op2 = N1.getOperand(2);
+       ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
+ 
+       if (N1Op2C && N2C) {
+         if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
+           if (VT == N1.getOperand(1).getValueType())
+             return N1.getOperand(1);
+           else
+             return getSExtOrTrunc(N1.getOperand(1), DL, VT);
+         }
+ 
+         return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+       }
+     }
+ 
+     // EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
+     // when vector types are scalarized and v1iX is legal.
+     // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
+     if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+         N1.getValueType().getVectorNumElements() == 1) {
+       return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
+                      N1.getOperand(1));
+     }
+     break;
+   case ISD::EXTRACT_ELEMENT:
+     assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+     assert(!N1.getValueType().isVector() && !VT.isVector() &&
+            (N1.getValueType().isInteger() == VT.isInteger()) &&
+            N1.getValueType() != VT &&
+            "Wrong types for EXTRACT_ELEMENT!");
+ 
+     // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+     // 64-bit integers into 32-bit parts.  Instead of building the extract of
+     // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+     if (N1.getOpcode() == ISD::BUILD_PAIR)
+       return N1.getOperand(N2C->getZExtValue());
+ 
+     // EXTRACT_ELEMENT of a constant int is also very common.
+     if (N1C) {
+       unsigned ElementSize = VT.getSizeInBits();
+       unsigned Shift = ElementSize * N2C->getZExtValue();
+       APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
+       return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
+     }
+     break;
+   case ISD::EXTRACT_SUBVECTOR:
+     if (VT.isSimple() && N1.getValueType().isSimple()) {
+       assert(VT.isVector() && N1.getValueType().isVector() &&
+              "Extract subvector VTs must be a vectors!");
+       assert(VT.getVectorElementType() ==
+              N1.getValueType().getVectorElementType() &&
+              "Extract subvector VTs must have the same element type!");
+       assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
+              "Extract subvector must be from larger vector to smaller vector!");
+ 
+       if (N2C) {
+         assert((VT.getVectorNumElements() + N2C->getZExtValue()
+                 <= N1.getValueType().getVectorNumElements())
+                && "Extract subvector overflow!");
+       }
+ 
+       // Trivial extraction.
+       if (VT.getSimpleVT() == N1.getSimpleValueType())
+         return N1;
+ 
+       // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
+       if (N1.isUndef())
+         return getUNDEF(VT);
+ 
+       // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
+       // the concat have the same type as the extract.
+       if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+           N1.getNumOperands() > 0 &&
+           VT == N1.getOperand(0).getValueType()) {
+         unsigned Factor = VT.getVectorNumElements();
+         return N1.getOperand(N2C->getZExtValue() / Factor);
+       }
+ 
+       // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
+       // during shuffle legalization.
+       if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
+           VT == N1.getOperand(1).getValueType())
+         return N1.getOperand(1);
+     }
+     break;
+   }
+ 
+   // Perform trivial constant folding.
+   if (SDValue SV =
+           FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
+     return SV;
+ 
+   // Constant fold FP operations.
+   bool HasFPExceptions = TLI->hasFloatingPointExceptions();
+   if (N1CFP) {
+     if (N2CFP) {
+       APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
+       APFloat::opStatus s;
+       switch (Opcode) {
+       case ISD::FADD:
+         s = V1.add(V2, APFloat::rmNearestTiesToEven);
+         if (!HasFPExceptions || s != APFloat::opInvalidOp)
+           return getConstantFP(V1, DL, VT);
+         break;
+       case ISD::FSUB:
+         s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
+         if (!HasFPExceptions || s!=APFloat::opInvalidOp)
+           return getConstantFP(V1, DL, VT);
+         break;
+       case ISD::FMUL:
+         s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
+         if (!HasFPExceptions || s!=APFloat::opInvalidOp)
+           return getConstantFP(V1, DL, VT);
+         break;
+       case ISD::FDIV:
+         s = V1.divide(V2, APFloat::rmNearestTiesToEven);
+         if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+                                  s!=APFloat::opDivByZero)) {
+           return getConstantFP(V1, DL, VT);
+         }
+         break;
+       case ISD::FREM :
+         s = V1.mod(V2);
+         if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+                                  s!=APFloat::opDivByZero)) {
+           return getConstantFP(V1, DL, VT);
+         }
+         break;
+       case ISD::FCOPYSIGN:
+         V1.copySign(V2);
+         return getConstantFP(V1, DL, VT);
+       default: break;
+       }
+     }
+ 
+     if (Opcode == ISD::FP_ROUND) {
+       APFloat V = N1CFP->getValueAPF();    // make copy
+       bool ignored;
+       // This can return overflow, underflow, or inexact; we don't care.
+       // FIXME need to be more flexible about rounding mode.
+       (void)V.convert(EVTToAPFloatSemantics(VT),
+                       APFloat::rmNearestTiesToEven, &ignored);
+       return getConstantFP(V, DL, VT);
+     }
+   }
+ 
+   switch (Opcode) {
+   case ISD::FADD:
+   case ISD::FSUB:
+   case ISD::FMUL:
+   case ISD::FDIV:
+   case ISD::FREM:
+     // If both operands are undef, the result is undef. If 1 operand is undef,
+     // the result is NaN. This should match the behavior of the IR optimizer.
+     if (N1.isUndef() && N2.isUndef())
+       return getUNDEF(VT);
+     if (N1.isUndef() || N2.isUndef())
+       return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
+   }
+ 
+   // Canonicalize an UNDEF to the RHS, even over a constant.
+   if (N1.isUndef()) {
+     if (TLI->isCommutativeBinOp(Opcode)) {
+       std::swap(N1, N2);
+     } else {
+       switch (Opcode) {
+       case ISD::FP_ROUND_INREG:
+       case ISD::SIGN_EXTEND_INREG:
+       case ISD::SUB:
+         return getUNDEF(VT);     // fold op(undef, arg2) -> undef
+       case ISD::UDIV:
+       case ISD::SDIV:
+       case ISD::UREM:
+       case ISD::SREM:
+         return getConstant(0, DL, VT);    // fold op(undef, arg2) -> 0
+       }
+     }
+   }
+ 
+   // Fold a bunch of operators when the RHS is undef.
+   if (N2.isUndef()) {
+     switch (Opcode) {
+     case ISD::XOR:
+       if (N1.isUndef())
+         // Handle undef ^ undef -> 0 special case. This is a common
+         // idiom (misuse).
+         return getConstant(0, DL, VT);
+       LLVM_FALLTHROUGH;
+     case ISD::ADD:
+     case ISD::SUB:
+     case ISD::UDIV:
+     case ISD::SDIV:
+     case ISD::UREM:
+     case ISD::SREM:
+       return getUNDEF(VT);       // fold op(arg1, undef) -> undef
+     case ISD::MUL:
+     case ISD::AND:
+       return getConstant(0, DL, VT);  // fold op(arg1, undef) -> 0
+     case ISD::OR:
+       return getAllOnesConstant(DL, VT);
+     }
+   }
+ 
+   // Memoize this node if possible.
+   SDNode *N;
+   SDVTList VTs = getVTList(VT);
+   SDValue Ops[] = {N1, N2};
+   if (VT != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTs, Ops);
+     void *IP = nullptr;
+     if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+       E->intersectFlagsWith(Flags);
+       return SDValue(E, 0);
+     }
+ 
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     N->setFlags(Flags);
+     createOperands(N, Ops);
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     createOperands(N, Ops);
+   }
+ 
+   InsertNode(N);
+   SDValue V = SDValue(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               SDValue N1, SDValue N2, SDValue N3,
+                               const SDNodeFlags Flags) {
+   // Opcode-specific checks and folds that can avoid creating a new node.
+   switch (Opcode) {
+   case ISD::FMA: {
+     assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+     assert(N1.getValueType() == VT && N2.getValueType() == VT &&
+            N3.getValueType() == VT && "FMA types must match!");
+     ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+     ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+     ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
+     if (N1CFP && N2CFP && N3CFP) {
+       APFloat  V1 = N1CFP->getValueAPF();
+       const APFloat &V2 = N2CFP->getValueAPF();
+       const APFloat &V3 = N3CFP->getValueAPF();
+       APFloat::opStatus s =
+         V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
+       if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
+         return getConstantFP(V1, DL, VT);
+     }
+     break;
+   }
+   case ISD::BUILD_VECTOR: {
+     // Attempt to simplify BUILD_VECTOR.
+     SDValue Ops[] = {N1, N2, N3};
+     if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+       return V;
+     break;
+   }
+   case ISD::CONCAT_VECTORS: {
+     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+     SDValue Ops[] = {N1, N2, N3};
+     if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+       return V;
+     break;
+   }
+   case ISD::SETCC: {
+     assert(VT.isInteger() && "SETCC result type must be an integer!");
+     assert(N1.getValueType() == N2.getValueType() &&
+            "SETCC operands must have the same type!");
+     assert(VT.isVector() == N1.getValueType().isVector() &&
+            "SETCC type should be vector iff the operand type is vector!");
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
+            "SETCC vector element counts must match!");
+     // Use FoldSetCC to simplify SETCCs; N3 carries the condition code.
+     if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
+       return V;
+     // Otherwise try vector constant folding on all three operands.
+     SDValue Ops[] = {N1, N2, N3};
+     if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) {
+       NewSDValueDbgMsg(V, "New node vector constant folding: ", this);
+       return V;
+     }
+     break;
+   }
+   case ISD::SELECT:
+   case ISD::VSELECT:
+     if (SDValue V = simplifySelect(N1, N2, N3))
+       return V;
+     break;
+   case ISD::VECTOR_SHUFFLE:
+     llvm_unreachable("should use getVectorShuffle constructor!");
+   case ISD::INSERT_VECTOR_ELT: {
+     ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
+     // Inserting at a constant index past the last element yields UNDEF.
+     if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+       return getUNDEF(VT);
+     break;
+   }
+   case ISD::INSERT_SUBVECTOR: {
+     SDValue Index = N3;
+     if (VT.isSimple() && N1.getValueType().isSimple()
+         && N2.getValueType().isSimple()) {
+       assert(VT.isVector() && N1.getValueType().isVector() &&
+              N2.getValueType().isVector() &&
+              "Insert subvector VTs must be a vectors");
+       assert(VT == N1.getValueType() &&
+              "Dest and insert subvector source types must match!");
+       assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
+              "Insert subvector must be from smaller vector to larger vector!");
+       if (isa<ConstantSDNode>(Index)) {
+         assert((N2.getValueType().getVectorNumElements() +
+                 cast<ConstantSDNode>(Index)->getZExtValue()
+                 <= VT.getVectorNumElements())
+                && "Insert subvector overflow!");
+       }
+ 
+       // Trivial insertion: the subvector already has the result type.
+       if (VT.getSimpleVT() == N2.getSimpleValueType())
+         return N2;
+     }
+     break;
+   }
+   case ISD::BITCAST:
+     // A bitcast whose operand already has type VT folds to the operand.
+     if (N1.getValueType() == VT)
+       return N1;
+     break;
+   }
+ 
+   // Memoize the node in CSEMap unless it produces glue (MVT::Glue).
+   SDNode *N;
+   SDVTList VTs = getVTList(VT);
+   SDValue Ops[] = {N1, N2, N3};
+   if (VT != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTs, Ops);
+     void *IP = nullptr;
+     if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+       E->intersectFlagsWith(Flags);
+       return SDValue(E, 0);
+     }
+ 
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     N->setFlags(Flags);
+     createOperands(N, Ops);
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     createOperands(N, Ops);
+   }
+ 
+   InsertNode(N);
+   SDValue V = SDValue(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+   SDValue Operands[] = {N1, N2, N3, N4}; // Passed on as one operand list.
+   return getNode(Opcode, DL, VT, Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+                               SDValue N5) {
+   SDValue Operands[] = {N1, N2, N3, N4, N5}; // Passed on as one operand list.
+   return getNode(Opcode, DL, VT, Operands);
+ }
+ 
+ /// getStackArgumentTokenFactor - Build a TokenFactor over Chain and the
+ /// output chains of the loads of incoming stack arguments.
+ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
+   SDNode *Entry = getEntryNode().getNode();
+   SmallVector<SDValue, 8> Chains;
+ 
+   // The original chain goes first. When this is used by target LowerCall
+   // hooks, this helps legalize find the CALLSEQ_BEGIN node.
+   Chains.push_back(Chain);
+   // Among the users of the entry node, pick each load whose base pointer
+   // is a frame index below zero and record its chain result.
+   for (SDNode::use_iterator UI = Entry->use_begin(), UE = Entry->use_end();
+        UI != UE; ++UI) {
+     LoadSDNode *Ld = dyn_cast<LoadSDNode>(*UI);
+     FrameIndexSDNode *Slot =
+         Ld ? dyn_cast<FrameIndexSDNode>(Ld->getBasePtr()) : nullptr;
+     if (Slot && Slot->getIndex() < 0)
+       Chains.push_back(SDValue(Ld, 1));
+   }
+   return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Chains);
+ }
+ 
+ /// getMemsetValue - Expand the byte-sized memset fill operand into a value
+ /// of type VT made of repetitions of that byte.
+ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
+                               const SDLoc &dl) {
+   assert(!Value.isUndef());
+   const unsigned ScalarBits = VT.getScalarSizeInBits();
+ 
+   // A constant fill byte is splatted directly into a constant of type VT.
+   if (auto *FillC = dyn_cast<ConstantSDNode>(Value)) {
+     assert(FillC->getAPIntValue().getBitWidth() == 8);
+     APInt Splat = APInt::getSplat(ScalarBits, FillC->getAPIntValue());
+     if (!VT.isInteger())
+       return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Splat),
+                                dl, VT);
+     // Types wider than 64 bits, or fill values the target cannot store as
+     // an immediate, get an opaque constant.
+     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+     bool IsOpaque = VT.getSizeInBits() > 64 ||
+                     !TLI.isLegalStoreImmediate(FillC->getSExtValue());
+     return DAG.getConstant(Splat, dl, VT, false, IsOpaque);
+   }
+ 
+   // Non-constant fill: zero-extend the i8 to an integer of the scalar width.
+   assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");
+   EVT IntVT = VT.getScalarType();
+   if (!IntVT.isInteger())
+     IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
+   Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
+   // Multiplying by 0x0101... replicates the byte across the wider integer.
+   if (ScalarBits > 8)
+     Value = DAG.getNode(
+         ISD::MUL, dl, IntVT, Value,
+         DAG.getConstant(APInt::getSplat(ScalarBits, APInt(8, 0x01)), dl, IntVT));
+   // Bitcast to a non-integer scalar and/or splat to a vector if VT needs it.
+   if (VT != Value.getValueType() && !VT.isInteger())
+     Value = DAG.getBitcast(VT.getScalarType(), Value);
+   if (VT != Value.getValueType())
+     Value = DAG.getSplatBuildVector(VT, dl, Value);
+   return Value;
+ }
+ 
+ /// getMemsetStringVal - Counterpart of getMemsetValue used when a memcpy from
+ /// a constant string is turned into a memset. Returns a null SDValue when
+ /// TLI.shouldConvertConstantLoadToIntImm rejects the immediate.
+ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
+                                   const TargetLowering &TLI,
+                                   const ConstantDataArraySlice &Slice) {
+   // A slice without a backing array stands for all-zero data.
+   if (!Slice.Array) {
+     if (VT.isInteger())
+       return DAG.getConstant(0, dl, VT);
+     if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
+       return DAG.getConstantFP(0.0, dl, VT);
+     if (!VT.isVector())
+       llvm_unreachable("Expected type!");
+     // Remaining vector types: bitcast an integer zero vector with the same
+     // element count (i32 elements for f32 vectors, i64 otherwise).
+     MVT IntEltVT =
+         (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+     EVT IntVecVT = EVT::getVectorVT(*DAG.getContext(), IntEltVT,
+                                     VT.getVectorNumElements());
+     return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getConstant(0, dl, IntVecVT));
+   }
+ 
+   assert(!VT.isVector() && "Can't handle vector type here!");
+   const unsigned BitWidth = VT.getSizeInBits();
+   const unsigned ByteWidth = BitWidth / 8;
+   const unsigned NumBytes = std::min(ByteWidth, unsigned(Slice.Length));
+   const bool IsLE = DAG.getDataLayout().isLittleEndian();
+ 
+   // Assemble the first NumBytes bytes of the slice into one integer, placing
+   // byte i according to the target's endianness.
+   APInt Val(BitWidth, 0);
+   for (unsigned i = 0; i != NumBytes; ++i) {
+     unsigned Shift = (IsLE ? i : ByteWidth - i - 1) * 8;
+     Val |= (uint64_t)(unsigned char)Slice[i] << Shift;
+   }
+ 
+   // If the "cost" of materializing the integer immediate is less than the cost
+   // of a load, then it is cost effective to turn the load into the immediate.
+   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+   if (!TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
+     return SDValue(nullptr, 0);
+   return DAG.getConstant(Val, dl, VT);
+ }
+ 
+ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
+                                            const SDLoc &DL) {
+   EVT PtrVT = Base.getValueType(); // The sum keeps the type of Base.
+   return getNode(ISD::ADD, DL, PtrVT, Base, getConstant(Offset, DL, PtrVT));
+ }
+ 
+ /// Returns true if memcpy source is constant data; Slice is filled in by
+ /// getConstantDataArrayInfo.
+ static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
+   // Accept either a bare GlobalAddress or GlobalAddress + Constant; in the
+   // latter case remember the constant as an extra offset.
+   SDValue Base = Src;
+   uint64_t Delta = 0;
+   if (Src.getOpcode() == ISD::ADD &&
+       Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+       Src.getOperand(1).getOpcode() == ISD::Constant) {
+     Base = Src.getOperand(0);
+     Delta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+   }
+   if (Base.getOpcode() != ISD::GlobalAddress)
+     return false;
+   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Base);
+   return getConstantDataArrayInfo(GA->getGlobal(), Slice, 8, Delta + GA->getOffset());
+ }
+ 
+ /// Determines the optimal series of memory ops to replace the memset / memcpy.
+ /// Returns true if the number of memory ops does not exceed the threshold
+ /// (Limit), false otherwise. The value types of the memory ops to perform are
+ /// appended to MemOps; on failure MemOps may hold a partial sequence.
+ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
+                                      unsigned Limit, uint64_t Size,
+                                      unsigned DstAlign, unsigned SrcAlign,
+                                      bool IsMemset,
+                                      bool ZeroMemset,
+                                      bool MemcpyStrSrc,
+                                      bool AllowOverlap,
+                                      unsigned DstAS, unsigned SrcAS,
+                                      SelectionDAG &DAG,
+                                      const TargetLowering &TLI) {
+   assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
+          "Expecting memcpy / memset source to meet alignment requirement!");
+   // If 'SrcAlign' is zero, that means the memory operation does not need to
+   // load the value, i.e. memset or memcpy from constant string. Otherwise,
+   // it's the inferred alignment of the source. 'DstAlign', on the other hand,
+   // is the specified alignment of the memory operation. If it is zero, that
+   // means it's possible to change the alignment of the destination.
+   // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
+   // not need to be loaded.
+   EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
+                                    IsMemset, ZeroMemset, MemcpyStrSrc,
+                                    DAG.getMachineFunction());
+ 
+   if (VT == MVT::Other) {
+     // Use the largest integer type whose alignment constraints are satisfied.
+     // We only need to check DstAlign here as SrcAlign is always greater or
+     // equal to DstAlign (or zero).
+     VT = MVT::i64;
+     while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
+            !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
+       VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+     assert(VT.isInteger());
+ 
+     // Find the largest legal integer type.
+     MVT LVT = MVT::i64;
+     while (!TLI.isTypeLegal(LVT))
+       LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
+     assert(LVT.isInteger());
+ 
+     // If the type we've chosen is larger than the largest legal integer type
+     // then use that instead.
+     if (VT.bitsGT(LVT))
+       VT = LVT;
+   }
+ 
+   unsigned NumMemOps = 0;
+   while (Size != 0) {
+     unsigned VTSize = VT.getSizeInBits() / 8;
+     while (VTSize > Size) {
+       // For now, only use non-vector load / store's for the left-over pieces.
+       EVT NewVT = VT;
+       unsigned NewVTSize;
+ 
+       bool Found = false;
+       if (VT.isVector() || VT.isFloatingPoint()) {
+         NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+         if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) &&
+             TLI.isSafeMemOpType(NewVT.getSimpleVT()))
+           Found = true;
+         else if (NewVT == MVT::i64 &&
+                  TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
+                  TLI.isSafeMemOpType(MVT::f64)) {
+           // i64 is usually not legal on 32-bit targets, but f64 may be.
+           NewVT = MVT::f64;
+           Found = true;
+         }
+       }
+ 
+       if (!Found) {
+         do {
+           NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+           if (NewVT == MVT::i8)
+             break;
+         } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT()));
+       }
+       NewVTSize = NewVT.getSizeInBits() / 8;
+ 
+       // If the new VT cannot cover all of the remaining bits, then consider
+       // issuing a (or a pair of) unaligned and overlapping load / store: VT is
+       // kept and this op is accounted as covering exactly the remaining Size.
+       bool Fast;
+       if (NumMemOps && AllowOverlap && NewVTSize < Size &&
+           TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) &&
+           Fast)
+         VTSize = Size;
+       else {
+         VT = NewVT;
+         VTSize = NewVTSize;
+       }
+     }
+ 
+     if (++NumMemOps > Limit)
+       return false;
+ 
+     MemOps.push_back(VT);
+     Size -= VTSize;
+   }
+ 
+   return true;
+ }
+ 
+ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+   // On Darwin, -Os means optimize for size without hurting performance, so
+   // only really optimize for size when -Oz (MinSize) is used.
+   const auto &F = MF.getFunction();
+   const bool IsDarwin = MF.getTarget().getTargetTriple().isOSDarwin();
+   return IsDarwin ? F.optForMinSize() : F.optForSize();
+ }
+ 
+ /// Group loads and stores [From, To) of an inlined memcpy: the load chains are
+ /// joined by one TokenFactor and each store is re-created on that token.
+ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                           SmallVector<SDValue, 32> &OutChains, unsigned From,
+                           unsigned To, SmallVector<SDValue, 16> &OutLoadChains,
+                           SmallVector<SDValue, 16> &OutStoreChains) {
+   assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
+   assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
+   SmallVector<SDValue, 16> GluedLoadChains;
+   for (unsigned i = From; i < To; ++i) {
+     OutChains.push_back(OutLoadChains[i]);
+     GluedLoadChains.push_back(OutLoadChains[i]);
+   }
+   SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                   GluedLoadChains);
+   for (unsigned i = From; i < To; ++i) {
+     // Entries must be stores; cast<> asserts it instead of yielding null.
+     StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]);
+     SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
+                                          ST->getBasePtr(), ST->getMemoryVT(),
+                                          ST->getMemOperand());
+     OutChains.push_back(NewStore);
+   }
+ }
+ 
+ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+                                        SDValue Chain, SDValue Dst, SDValue Src,
+                                        uint64_t Size, unsigned Align,
+                                        bool isVol, bool AlwaysInline,
+                                        MachinePointerInfo DstPtrInfo,
+                                        MachinePointerInfo SrcPtrInfo) {
+   // Turn a memcpy of undef to nop: the incoming chain is returned unchanged.
+   if (Src.isUndef())
+     return Chain;
+ 
+   // Expand memcpy to a series of load and store ops if the size operand falls
+   // below a certain threshold.
+   // TODO: In the AlwaysInline case, if the size is big then generate a loop
+   // rather than maybe a humongous number of loads and stores.
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   const DataLayout &DL = DAG.getDataLayout();
+   LLVMContext &C = *DAG.getContext();
+   std::vector<EVT> MemOps;
+   bool DstAlignCanChange = false;
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+   bool OptSize = shouldLowerMemFuncForSize(MF);
+   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+   if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
+     DstAlignCanChange = true;
+   unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+   if (Align > SrcAlign)
+     SrcAlign = Align;
+   ConstantDataArraySlice Slice;
+   bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
+   bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
+   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
+ 
+   if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                 (DstAlignCanChange ? 0 : Align),
+                                 (isZeroConstant ? 0 : SrcAlign),
+                                 false, false, CopyFromConstant, true,
+                                 DstPtrInfo.getAddrSpace(),
+                                 SrcPtrInfo.getAddrSpace(),
+                                 DAG, TLI))
+     return SDValue();
+ 
+   if (DstAlignCanChange) {
+     Type *Ty = MemOps[0].getTypeForEVT(C);
+     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
+ 
+     // Don't promote to an alignment that would require dynamic stack
+     // realignment.
+     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+     if (!TRI->needsStackRealignment(MF))
+       while (NewAlign > Align &&
+              DL.exceedsNaturalStackAlignment(NewAlign))
+           NewAlign /= 2;
+ 
+     if (NewAlign > Align) {
+       // Give the stack frame object a larger alignment if needed.
+       if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+         MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+       Align = NewAlign;
+     }
+   }
+ 
+   MachineMemOperand::Flags MMOFlags =
+       isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+   SmallVector<SDValue, 16> OutLoadChains;
+   SmallVector<SDValue, 16> OutStoreChains;
+   SmallVector<SDValue, 32> OutChains;
+   unsigned NumMemOps = MemOps.size();
+   uint64_t SrcOff = 0, DstOff = 0;
+   for (unsigned i = 0; i != NumMemOps; ++i) {
+     EVT VT = MemOps[i];
+     unsigned VTSize = VT.getSizeInBits() / 8;
+     SDValue Value, Store;
+ 
+     if (VTSize > Size) {
+       // Issuing an unaligned load / store pair that overlaps with the previous
+       // pair. Adjust the offset accordingly.
+       assert(i == NumMemOps-1 && i != 0);
+       SrcOff -= VTSize - Size;
+       DstOff -= VTSize - Size;
+     }
+ 
+     if (CopyFromConstant &&
+         (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
+       // It's unlikely a store of a vector immediate can be done in a single
+       // instruction. It would require a load from a constantpool first.
+       // We only handle zero vectors here.
+       // FIXME: Handle other cases where store of vector immediate is done in
+       // a single instruction.
+       ConstantDataArraySlice SubSlice;
+       if (SrcOff < Slice.Length) {
+         SubSlice = Slice;
+         SubSlice.move(SrcOff);
+       } else {
+         // This is an out-of-bounds access and hence UB. Pretend we read zero.
+         SubSlice.Array = nullptr;
+         SubSlice.Offset = 0;
+         SubSlice.Length = VTSize;
+       }
+       Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
+       if (Value.getNode()) {
+         Store = DAG.getStore(Chain, dl, Value,
+                              DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+                              DstPtrInfo.getWithOffset(DstOff), Align,
+                              MMOFlags);
+         OutChains.push_back(Store);
+       }
+     }
+ 
+     if (!Store.getNode()) {
+       // The type might not be legal for the target.  This should only happen
+       // if the type is smaller than a legal type, as on PPC, so the right
+       // thing to do is generate a LoadExt/StoreTrunc pair.  These simplify
+       // to Load/Store if NVT==VT.
+       // FIXME does the case above also need this?
+       EVT NVT = TLI.getTypeToTransformTo(C, VT);
+       assert(NVT.bitsGE(VT));
+ 
+       bool isDereferenceable =
+         SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+       MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+       if (isDereferenceable)
+         SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+ 
+       Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
+                              DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+                              SrcPtrInfo.getWithOffset(SrcOff), VT,
+                              MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
+       OutLoadChains.push_back(Value.getValue(1));
+ 
+       Store = DAG.getTruncStore(
+           Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+           DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+       OutStoreChains.push_back(Store);
+     }
+     SrcOff += VTSize;
+     DstOff += VTSize;
+     Size -= VTSize;
+   }
+ 
+   unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
+                                 TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
+   unsigned NumLdStInMemcpy = OutStoreChains.size();
+ 
+   if (NumLdStInMemcpy) {
+     // A memcpy from constant data may have been emitted purely as stores of
+     // immediates (like a memset); those went straight to OutChains above.
+     // Only the load/store pairs recorded in the two lists are ganged up here.
+     if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
+       // Grouping is disabled: emit each load / store pair as it is.
+       for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
+         OutChains.push_back(OutLoadChains[i]);
+         OutChains.push_back(OutStoreChains[i]);
+       }
+     } else {
+       // All pairs fit into a single group of at most GluedLdStLimit.
+       if (NumLdStInMemcpy <= GluedLdStLimit) {
+           chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                         NumLdStInMemcpy, OutLoadChains,
+                                         OutStoreChains);
+       } else {
+         unsigned NumberLdChain =  NumLdStInMemcpy / GluedLdStLimit;
+         unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
+         unsigned GlueIter = 0;
+ 
+         for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+           unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
+           unsigned IndexTo   = NumLdStInMemcpy - GlueIter;
+ 
+           chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+                                        OutLoadChains, OutStoreChains);
+           GlueIter += GluedLdStLimit;
+         }
+ 
+         // Residual ld/st: the leading pairs that did not fill a whole group.
+         if (RemainingLdStInMemcpy) {
+           chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                         RemainingLdStInMemcpy, OutLoadChains,
+                                         OutStoreChains);
+         }
+       }
+     }
+   }
+   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+ 
+ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+                                         SDValue Chain, SDValue Dst, SDValue Src,
+                                         uint64_t Size, unsigned Align,
+                                         bool isVol, bool AlwaysInline,
+                                         MachinePointerInfo DstPtrInfo,
+                                         MachinePointerInfo SrcPtrInfo) {
+   // Nothing needs to be copied when the source is undef.
+   if (Src.isUndef())
+     return Chain;
+ 
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   const DataLayout &DL = DAG.getDataLayout();
+   LLVMContext &C = *DAG.getContext();
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+   bool OptSize = shouldLowerMemFuncForSize(MF);
+ 
+   // The destination alignment can only be raised when Dst is a frame index
+   // that does not refer to a fixed object.
+   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+   const bool DstAlignCanChange = FI && !MFI.isFixedObjectIndex(FI->getIndex());
+ 
+   // Use the larger of the inferred source alignment and the given alignment.
+   unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+   if (SrcAlign < Align)
+     SrcAlign = Align;
+ 
+   // Pick the value types to copy with; return an empty SDValue when more
+   // memory ops than Limit would be needed (no limit if AlwaysInline).
+   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
+   std::vector<EVT> MemOps;
+   if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                 DstAlignCanChange ? 0 : Align, SrcAlign,
+                                 /*IsMemset=*/false, /*ZeroMemset=*/false,
+                                 /*MemcpyStrSrc=*/false, /*AllowOverlap=*/false,
+                                 DstPtrInfo.getAddrSpace(),
+                                 SrcPtrInfo.getAddrSpace(), DAG, TLI))
+     return SDValue();
+ 
+   if (DstAlignCanChange) {
+     unsigned NewAlign =
+         (unsigned)DL.getABITypeAlignment(MemOps[0].getTypeForEVT(C));
+     if (NewAlign > Align) {
+       // Give the stack frame object a larger alignment if needed.
+       if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+         MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+       Align = NewAlign;
+     }
+   }
+ 
+   const MachineMemOperand::Flags MMOFlags =
+       isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+   const unsigned NumMemOps = MemOps.size();
+ 
+   // Phase 1: issue every load on the incoming chain.
+   SmallVector<SDValue, 8> LoadValues;
+   SmallVector<SDValue, 8> LoadChains;
+   uint64_t SrcOff = 0;
+   for (unsigned Idx = 0; Idx != NumMemOps; ++Idx) {
+     EVT VT = MemOps[Idx];
+     unsigned VTSize = VT.getSizeInBits() / 8;
+     MachinePointerInfo LoadInfo = SrcPtrInfo.getWithOffset(SrcOff);
+     MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+     if (LoadInfo.isDereferenceable(VTSize, C, DL))
+       SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+ 
+     SDValue Loaded =
+         DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+                     LoadInfo, SrcAlign, SrcMMOFlags);
+     LoadValues.push_back(Loaded);
+     LoadChains.push_back(Loaded.getValue(1));
+     SrcOff += VTSize;
+   }
+ 
+   // Phase 2: issue the stores on a TokenFactor of all load chains, so every
+   // store is ordered after every load.
+   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+   SmallVector<SDValue, 8> OutChains;
+   uint64_t DstOff = 0;
+   for (unsigned Idx = 0; Idx != NumMemOps; ++Idx) {
+     unsigned VTSize = MemOps[Idx].getSizeInBits() / 8;
+     OutChains.push_back(
+         DAG.getStore(Chain, dl, LoadValues[Idx],
+                      DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+                      DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags));
+     DstOff += VTSize;
+   }
+   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+ 
+ /// Lower the call to 'memset' intrinsic function into a series of store
+ /// operations.
+ ///
+ /// \param DAG Selection DAG where lowered code is placed.
+ /// \param dl Link to corresponding IR location.
+ /// \param Chain Control flow dependency.
+ /// \param Dst Pointer to destination memory location.
+ /// \param Src Value of byte to write into the memory.
+ /// \param Size Number of bytes to write.
+ /// \param Align Alignment of the destination in bytes.
+ /// \param isVol True if destination is volatile.
+ /// \param DstPtrInfo IR information on the memory pointer.
+ /// \returns New head in the control flow if lowering was successful (\p Chain
+ /// itself when \p Src is undef), empty SDValue otherwise.
+ ///
+ /// The function tries to replace 'llvm.memset' intrinsic with several store
+ /// operations and value calculation code. This is usually profitable for small
+ /// memory size.
+ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
+                                SDValue Chain, SDValue Dst, SDValue Src,
+                                uint64_t Size, unsigned Align, bool isVol,
+                                MachinePointerInfo DstPtrInfo) {
+   // Turn a memset of undef to nop.
+   if (Src.isUndef())
+     return Chain;
+ 
+   // Expand memset to a series of store ops if the size operand
+   // falls below a certain threshold.
+   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+   std::vector<EVT> MemOps;
+   bool DstAlignCanChange = false;
+   MachineFunction &MF = DAG.getMachineFunction();
+   MachineFrameInfo &MFI = MF.getFrameInfo();
+   bool OptSize = shouldLowerMemFuncForSize(MF);
+   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+   if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
+     DstAlignCanChange = true;
+   bool IsZeroVal =
+     isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
+   if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
+                                 Size, (DstAlignCanChange ? 0 : Align), 0,
+                                 true, IsZeroVal, false, true,
+                                 DstPtrInfo.getAddrSpace(), ~0u,
+                                 DAG, TLI))
+     return SDValue();
+ 
+   if (DstAlignCanChange) {
+     Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+     unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+     if (NewAlign > Align) {
+       // Give the stack frame object a larger alignment if needed.
+       if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+         MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+       Align = NewAlign;
+     }
+   }
+ 
+   SmallVector<SDValue, 8> OutChains;  // one store per entry of MemOps
+   uint64_t DstOff = 0;
+   unsigned NumMemOps = MemOps.size();
+ 
+   // Find the largest store and generate the bit pattern for it.
+   EVT LargestVT = MemOps[0];
+   for (unsigned i = 1; i < NumMemOps; i++)
+     if (MemOps[i].bitsGT(LargestVT))
+       LargestVT = MemOps[i];
+   SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
+ 
+   for (unsigned i = 0; i < NumMemOps; i++) {
+     EVT VT = MemOps[i];
+     unsigned VTSize = VT.getSizeInBits() / 8;
+     if (VTSize > Size) {
+       // Issuing an unaligned store that overlaps with the previous store.
+       // Adjust the offset accordingly.
+       assert(i == NumMemOps-1 && i != 0);
+       DstOff -= VTSize - Size;
+     }
+ 
+     // If this store is smaller than the largest store see whether we can get
+     // the smaller value for free with a truncate.
+     SDValue Value = MemSetValue;
+     if (VT.bitsLT(LargestVT)) {
+       if (!LargestVT.isVector() && !VT.isVector() &&
+           TLI.isTruncateFree(LargestVT, VT))
+         Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
+       else
+         Value = getMemsetValue(Src, VT, DAG, dl);
+     }
+     assert(Value.getValueType() == VT && "Value with wrong type.");
+     SDValue Store = DAG.getStore(
+         Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+         DstPtrInfo.getWithOffset(DstOff), Align,
+         isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
+     OutChains.push_back(Store);
+     DstOff += VT.getSizeInBits() / 8;  // same value as VTSize
+     Size -= VTSize;
+   }
+ 
+   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+ }
+ 
+ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
+                                             unsigned AS) {
+   // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
+   // pointer operands can be losslessly bitcast to pointers of address space 0.
+   if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) {
+     report_fatal_error("cannot lower memory intrinsic in address space " +
+                        Twine(AS));
+   }
+ }
+ 
+ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                                 SDValue Src, SDValue Size, unsigned Align,
+                                 bool isVol, bool AlwaysInline, bool isTailCall,
+                                 MachinePointerInfo DstPtrInfo,
+                                 MachinePointerInfo SrcPtrInfo) {
+   assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+ 
+   // Check to see if we should lower the memcpy to loads and stores first.
+   // For cases within the target-specified limits, this is the best choice.
+   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+   if (ConstantSize) {
+     // Memcpy with size zero? Just return the original chain.
+     if (ConstantSize->isNullValue())
+       return Chain;
+ 
+     SDValue Result = getMemcpyLoadsAndStores(
+         *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Align, isVol,
+         false, DstPtrInfo, SrcPtrInfo);
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   // Then check to see if we should lower the memcpy with target-specific
+   // code. If the target chooses to do this, this is the next best.
+   if (TSI) {
+     SDValue Result = TSI->EmitTargetCodeForMemcpy(
+         *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
+         DstPtrInfo, SrcPtrInfo);
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   // If we really need inline code and the target declined to provide it,
+   // use a (potentially long) sequence of loads and stores.
+   if (AlwaysInline) {
+     assert(ConstantSize && "AlwaysInline requires a constant size!");
+     return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                    ConstantSize->getZExtValue(), Align, isVol,
+                                    true, DstPtrInfo, SrcPtrInfo);
+   }
+ 
+   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+   checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+ 
+   // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
+   // memcpy is not guaranteed to be safe. libc memcpys aren't required to
+   // respect volatile, so they may do things like read or write memory
+   // beyond the given memory regions. But fixing this isn't easy, and most
+   // people don't care.
+ 
+   // Emit a library call.
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+   Entry.Node = Dst; Args.push_back(Entry);
+   Entry.Node = Src; Args.push_back(Entry);
+   Entry.Node = Size; Args.push_back(Entry);
+   // FIXME: pass in SDLoc (possibly stale: dl is given to setDebugLoc below)
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+                     Dst.getValueType().getTypeForEVT(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
+                                       SDValue Dst, unsigned DstAlign,
+                                       SDValue Src, unsigned SrcAlign,
+                                       SDValue Size, Type *SizeTy,
+                                       unsigned ElemSz, bool isTailCall,
+                                       MachinePointerInfo DstPtrInfo,
+                                       MachinePointerInfo SrcPtrInfo) {
+   // Emit a library call; the alignments and pointer infos are not used here.
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+   Entry.Node = Dst;
+   Args.push_back(Entry);
+ 
+   Entry.Node = Src;
+   Args.push_back(Entry);
+ 
+   Entry.Ty = SizeTy;  // the length uses the caller-provided type
+   Entry.Node = Size;
+   Args.push_back(Entry);
+ 
+   RTLIB::Libcall LibraryCall =
+       RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+   if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+     report_fatal_error("Unsupported element size");
+ 
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+                     Type::getVoidTy(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(LibraryCall),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                                  SDValue Src, SDValue Size, unsigned Align,
+                                  bool isVol, bool isTailCall,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) {
+   assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+ 
+   // Check to see if we should lower the memmove to loads and stores first.
+   // For cases within the target-specified limits, this is the best choice.
+   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+   if (ConstantSize) {
+     // Memmove with size zero? Just return the original chain.
+     if (ConstantSize->isNullValue())
+       return Chain;
+ 
+     SDValue Result =
+       getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                ConstantSize->getZExtValue(), Align, isVol,
+                                false, DstPtrInfo, SrcPtrInfo);
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   // Then check to see if we should lower the memmove with target-specific
+   // code. If the target chooses to do this, this is the next best.
+   if (TSI) {
+     SDValue Result = TSI->EmitTargetCodeForMemmove(
+         *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+   checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+ 
+   // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
+   // not be safe.  See memcpy above for more details.
+ 
+   // Emit a library call.
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+   Entry.Node = Dst; Args.push_back(Entry);
+   Entry.Node = Src; Args.push_back(Entry);
+   Entry.Node = Size; Args.push_back(Entry);
+   // FIXME: pass in SDLoc (possibly stale: dl is given to setDebugLoc below)
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+                     Dst.getValueType().getTypeForEVT(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
+                                        SDValue Dst, unsigned DstAlign,
+                                        SDValue Src, unsigned SrcAlign,
+                                        SDValue Size, Type *SizeTy,
+                                        unsigned ElemSz, bool isTailCall,
+                                        MachinePointerInfo DstPtrInfo,
+                                        MachinePointerInfo SrcPtrInfo) {
+   // Emit a library call; the alignments and pointer infos are not used here.
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+   Entry.Node = Dst;
+   Args.push_back(Entry);
+ 
+   Entry.Node = Src;
+   Args.push_back(Entry);
+ 
+   Entry.Ty = SizeTy;  // the length uses the caller-provided type
+   Entry.Node = Size;
+   Args.push_back(Entry);
+ 
+   RTLIB::Libcall LibraryCall =
+       RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+   if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+     report_fatal_error("Unsupported element size");
+ 
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+                     Type::getVoidTy(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(LibraryCall),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+                                 SDValue Src, SDValue Size, unsigned Align,
+                                 bool isVol, bool isTailCall,
+                                 MachinePointerInfo DstPtrInfo) {
+   assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+ 
+   // Check to see if we should lower the memset to stores first.
+   // For cases within the target-specified limits, this is the best choice.
+   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+   if (ConstantSize) {
+     // Memset with size zero? Just return the original chain.
+     if (ConstantSize->isNullValue())
+       return Chain;
+ 
+     SDValue Result =
+       getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+                       Align, isVol, DstPtrInfo);
+ 
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   // Then check to see if we should lower the memset with target-specific
+   // code. If the target chooses to do this, this is the next best.
+   if (TSI) {
+     SDValue Result = TSI->EmitTargetCodeForMemset(
+         *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
+     if (Result.getNode())
+       return Result;
+   }
+ 
+   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+ 
+   // Emit a library call.
+   Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Node = Dst; Entry.Ty = IntPtrTy;
+   Args.push_back(Entry);
+   Entry.Node = Src;  // the fill value keeps the IR type of its own value type
+   Entry.Ty = Src.getValueType().getTypeForEVT(*getContext());
+   Args.push_back(Entry);
+   Entry.Node = Size;
+   Entry.Ty = IntPtrTy;
+   Args.push_back(Entry);
+ 
+   // FIXME: pass in SDLoc (possibly stale: dl is given to setDebugLoc below)
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+                     Dst.getValueType().getTypeForEVT(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
+                                       SDValue Dst, unsigned DstAlign,
+                                       SDValue Value, SDValue Size, Type *SizeTy,
+                                       unsigned ElemSz, bool isTailCall,
+                                       MachinePointerInfo DstPtrInfo) {
+   // Emit a library call; DstAlign and DstPtrInfo are not used here.
+   TargetLowering::ArgListTy Args;
+   TargetLowering::ArgListEntry Entry;
+   Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+   Entry.Node = Dst;
+   Args.push_back(Entry);
+ 
+   Entry.Ty = Type::getInt8Ty(*getContext());  // the fill value is passed as i8
+   Entry.Node = Value;
+   Args.push_back(Entry);
+ 
+   Entry.Ty = SizeTy;  // the length uses the caller-provided type
+   Entry.Node = Size;
+   Args.push_back(Entry);
+ 
+   RTLIB::Libcall LibraryCall =
+       RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+   if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+     report_fatal_error("Unsupported element size");
+ 
+   TargetLowering::CallLoweringInfo CLI(*this);
+   CLI.setDebugLoc(dl)
+       .setChain(Chain)
+       .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+                     Type::getVoidTy(*getContext()),
+                     getExternalSymbol(TLI->getLibcallName(LibraryCall),
+                                       TLI->getPointerTy(getDataLayout())),
+                     std::move(Args))
+       .setDiscardResult()
+       .setTailCall(isTailCall);
+ 
+   std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+   return CallResult.second;
+ }
+ 
+ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                                 SDVTList VTList, ArrayRef<SDValue> Ops,
+                                 MachineMemOperand *MMO) {
+   FoldingSetNodeID ID;  // profile: MemVT bits, node identity, address space
+   ID.AddInteger(MemVT.getRawBits());
+   AddNodeIDNode(ID, Opcode, VTList, Ops);
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void* IP = nullptr;  // passed to the lookup, then to CSEMap.InsertNode
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {  // matching node found
+     cast<AtomicSDNode>(E)->refineAlignment(MMO);
+     return SDValue(E, 0);
+   }
+ 
+   auto *N = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+                                     VTList, MemVT, MMO);
+   createOperands(N, Ops);
+ 
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   return SDValue(N, 0);
+ }
+ 
+ SDValue SelectionDAG::getAtomicCmpSwap(
+     unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
+     SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
+     unsigned Alignment, AtomicOrdering SuccessOrdering,
+     AtomicOrdering FailureOrdering, SyncScope::ID SSID) {
+   assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+          Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+   assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+ 
+   if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+     Alignment = getEVTAlignment(MemVT);
+ 
+   MachineFunction &MF = getMachineFunction();
+ 
+   // FIXME: Volatile isn't really correct; we should keep track of atomic
+   // orderings in the memoperand.
+   auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
+                MachineMemOperand::MOStore;
+   MachineMemOperand *MMO =  // SSID and both orderings are handed to the MMO
+     MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
+                             AAMDNodes(), nullptr, SSID, SuccessOrdering,
+                             FailureOrdering);
+ 
+   return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO);
+ }
+ 
+ SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
+                                        EVT MemVT, SDVTList VTs, SDValue Chain,
+                                        SDValue Ptr, SDValue Cmp, SDValue Swp,
+                                        MachineMemOperand *MMO) {
+   assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+          Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+   assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+ 
+   SDValue Ops[] = {Chain, Ptr, Cmp, Swp};  // operand order of the node
+   return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+ }
+ 
+ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                                 SDValue Chain, SDValue Ptr, SDValue Val,
+                                 const Value *PtrVal, unsigned Alignment,
+                                 AtomicOrdering Ordering,
+                                 SyncScope::ID SSID) {
+   if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+     Alignment = getEVTAlignment(MemVT);
+ 
+   MachineFunction &MF = getMachineFunction();
+   // An atomic store does not load. An atomic load does not store.
+   // (An atomicrmw obviously both loads and stores.)
+   // For now, atomics are considered to be volatile always, and they are
+   // chained as such.
+   // FIXME: Volatile isn't really correct; we should keep track of atomic
+   // orderings in the memoperand.
+   auto Flags = MachineMemOperand::MOVolatile;
+   if (Opcode != ISD::ATOMIC_STORE)
+     Flags |= MachineMemOperand::MOLoad;
+   if (Opcode != ISD::ATOMIC_LOAD)
+     Flags |= MachineMemOperand::MOStore;
+ 
+   MachineMemOperand *MMO =  // the pointer info is built from PtrVal
+     MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
+                             MemVT.getStoreSize(), Alignment, AAMDNodes(),
+                             nullptr, SSID, Ordering);
+ 
+   return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
+ }
+ 
+ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                                 SDValue Chain, SDValue Ptr, SDValue Val,
+                                 MachineMemOperand *MMO) {
+   assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+           Opcode == ISD::ATOMIC_LOAD_SUB ||
+           Opcode == ISD::ATOMIC_LOAD_AND ||
+           Opcode == ISD::ATOMIC_LOAD_CLR ||
+           Opcode == ISD::ATOMIC_LOAD_OR ||
+           Opcode == ISD::ATOMIC_LOAD_XOR ||
+           Opcode == ISD::ATOMIC_LOAD_NAND ||
+           Opcode == ISD::ATOMIC_LOAD_MIN ||
+           Opcode == ISD::ATOMIC_LOAD_MAX ||
+           Opcode == ISD::ATOMIC_LOAD_UMIN ||
+           Opcode == ISD::ATOMIC_LOAD_UMAX ||
+           Opcode == ISD::ATOMIC_SWAP ||
+           Opcode == ISD::ATOMIC_STORE) &&
+          "Invalid Atomic Op");
+ 
+   EVT VT = Val.getValueType();  // value result type for non-store opcodes
+ 
+   SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
+                                                getVTList(VT, MVT::Other);
+   SDValue Ops[] = {Chain, Ptr, Val};
+   return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+ }
+ 
+ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+                                 EVT VT, SDValue Chain, SDValue Ptr,
+                                 MachineMemOperand *MMO) {
+   assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
+ 
+   SDVTList VTs = getVTList(VT, MVT::Other);
+   SDValue Ops[] = {Chain, Ptr};  // an atomic load has no value operand
+   return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+ }
+ 
+ /// Create a MERGE_VALUES node from \p Ops; a single operand is returned as is.
+ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
+   if (Ops.size() == 1)
+     return Ops[0];
+ 
+   SmallVector<EVT, 4> VTs;  // one result type per operand, in operand order
+   VTs.reserve(Ops.size());
+   for (unsigned i = 0; i < Ops.size(); ++i)
+     VTs.push_back(Ops[i].getValueType());
+   return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops);
+ }
+ 
+ SDValue SelectionDAG::getMemIntrinsicNode(
+     unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
+     EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
+     MachineMemOperand::Flags Flags, unsigned Size) {
+   if (Align == 0)  // Ensure that codegen never sees alignment 0
+     Align = getEVTAlignment(MemVT);
+ 
+   if (!Size)  // A Size of 0 selects the store size of MemVT
+     Size = MemVT.getStoreSize();
+ 
+   MachineFunction &MF = getMachineFunction();
+   MachineMemOperand *MMO =
+     MF.getMachineMemOperand(PtrInfo, Flags, Size, Align);
+ 
+   return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
+ }
+ 
+ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
+                                           SDVTList VTList,
+                                           ArrayRef<SDValue> Ops, EVT MemVT,
+                                           MachineMemOperand *MMO) {
+   assert((Opcode == ISD::INTRINSIC_VOID ||
+           Opcode == ISD::INTRINSIC_W_CHAIN ||
+           Opcode == ISD::PREFETCH ||
+           Opcode == ISD::LIFETIME_START ||
+           Opcode == ISD::LIFETIME_END ||
+           ((int)Opcode <= std::numeric_limits<int>::max() &&
+            (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
+          "Opcode is not a memory-accessing opcode!");
+ 
+   // Memoize the node unless its last result type is glue.
+   MemIntrinsicSDNode *N;
+   if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTList, Ops);
+     ID.AddInteger(getSyntheticNodeSubclassData<MemIntrinsicSDNode>(
+         Opcode, dl.getIROrder(), VTList, MemVT, MMO));
+     ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+     void *IP = nullptr;  // passed to the lookup, then to CSEMap.InsertNode
+     if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+       cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+       return SDValue(E, 0);
+     }
+ 
+     N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+                                       VTList, MemVT, MMO);
+     createOperands(N, Ops);
+ 
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+                                       VTList, MemVT, MMO);
+     createOperands(N, Ops);
+   }
+   InsertNode(N);
+   return SDValue(N, 0);
+ }
+ 
+ /// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+ /// MachinePointerInfo record from it.  This is particularly useful because the
+ /// code generator has many cases where it doesn't bother passing in a
+ /// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+ static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
+                                            SelectionDAG &DAG, SDValue Ptr,
+                                            int64_t Offset = 0) {
+   // If this is FI+Offset, we can model it.
+   if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
+     return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+                                              FI->getIndex(), Offset);
+ 
+   // If this is (FI+Offset1)+Offset2, we can model it; otherwise keep Info.
+   if (Ptr.getOpcode() != ISD::ADD ||
+       !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
+       !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
+     return Info;
+ 
+   int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+   return MachinePointerInfo::getFixedStack(
+       DAG.getMachineFunction(), FI,
+       Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
+ }
+ 
+ /// InferPointerInfo - Variant of the above taking the offset as an SDValue.
+ /// A constant offset is forwarded as its sign-extended value, an undef offset
+ /// is forwarded as the default offset (0), and for any other offset \p Info
+ /// is returned unchanged.
+ static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
+                                            SelectionDAG &DAG, SDValue Ptr,
+                                            SDValue OffsetOp) {
+   // Only a constant or an undef 'Offset' value can be handled.
+   if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
+     return InferPointerInfo(Info, DAG, Ptr, OffsetNode->getSExtValue());
+   if (OffsetOp.isUndef())
+     return InferPointerInfo(Info, DAG, Ptr);
+   return Info;
+ }
+ 
+ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+                               EVT VT, const SDLoc &dl, SDValue Chain,
+                               SDValue Ptr, SDValue Offset,
+                               MachinePointerInfo PtrInfo, EVT MemVT,
+                               unsigned Alignment,
+                               MachineMemOperand::Flags MMOFlags,
+                               const AAMDNodes &AAInfo, const MDNode *Ranges) {
+   assert(Chain.getValueType() == MVT::Other &&
+         "Invalid chain type");
+   if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+     Alignment = getEVTAlignment(MemVT);
+ 
+   MMOFlags |= MachineMemOperand::MOLoad;  // callers need not set MOLoad
+   assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+   // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+   // clients.
+   if (PtrInfo.V.isNull())
+     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+ 
+   MachineFunction &MF = getMachineFunction();
+   MachineMemOperand *MMO = MF.getMachineMemOperand(
+       PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
+   return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
+ }
+ 
+ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+                               EVT VT, const SDLoc &dl, SDValue Chain,
+                               SDValue Ptr, SDValue Offset, EVT MemVT,
+                               MachineMemOperand *MMO) {
+   if (VT == MemVT) {  // same type: the requested extension is dropped
+     ExtType = ISD::NON_EXTLOAD;
+   } else if (ExtType == ISD::NON_EXTLOAD) {  // VT != MemVT: always asserts
+     assert(VT == MemVT && "Non-extending load from different memory type!");
+   } else {
+     // Extending load.
+     assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+            "Should only be an extending load, not truncating!");
+     assert(VT.isInteger() == MemVT.isInteger() &&
+            "Cannot convert from FP to Int or Int -> FP!");
+     assert(VT.isVector() == MemVT.isVector() &&
+            "Cannot use an ext load to convert to or from a vector!");
+     assert((!VT.isVector() ||
+             VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+            "Cannot use an ext load to change the number of vector elements!");
+   }
+ 
+   bool Indexed = AM != ISD::UNINDEXED;
+   assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+ 
+   SDVTList VTs = Indexed ?  // indexed loads add a Ptr-typed result
+     getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+   SDValue Ops[] = { Chain, Ptr, Offset };
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
+   ID.AddInteger(MemVT.getRawBits());
+   ID.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
+       dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;  // passed to the lookup, then to CSEMap.InsertNode
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<LoadSDNode>(E)->refineAlignment(MMO);
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+                                   ExtType, MemVT, MMO);
+   createOperands(N, Ops);
+ 
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+                               SDValue Ptr, MachinePointerInfo PtrInfo,
+                               unsigned Alignment,
+                               MachineMemOperand::Flags MMOFlags,
+                               const AAMDNodes &AAInfo, const MDNode *Ranges) {
+   SDValue Undef = getUNDEF(Ptr.getValueType());  // unindexed: no offset
+   return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+                  PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
+ }
+ 
+ SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+                               SDValue Ptr, MachineMemOperand *MMO) {
+   // Unindexed, non-extending load: undef offset, memory type equal to VT.
+   return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
+                  getUNDEF(Ptr.getValueType()), VT, MMO);
+ }
+ 
+ SDValue SelectionDAG::getExtLoad(
+     ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain,
+     SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment,
+     MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo) {
+   // Extending loads built here are always unindexed, so the offset operand is
+   // undef; the remaining arguments are forwarded unchanged.
+   SDValue NoOffset = getUNDEF(Ptr.getValueType());
+   return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset,
+                  PtrInfo, MemVT, Alignment, MMOFlags, AAInfo);
+ }
+ 
+ SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+                                  EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
+                                  MachineMemOperand *MMO) {
+   // Unindexed extending load: pass an undef offset along with the caller's MMO.
+   return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr,
+                  getUNDEF(Ptr.getValueType()), MemVT, MMO);
+ }
+ 
+ SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
+                                      SDValue Base, SDValue Offset,
+                                      ISD::MemIndexedMode AM) {
+   auto *LD = cast<LoadSDNode>(OrigLoad);
+   assert(LD->getOffset().isUndef() && "Load is already a indexed load!");
+   // Invariant/dereferenceable are intentionally not carried to the new load.
+   const auto DroppedFlags =
+       MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable;
+   auto MMOFlags = LD->getMemOperand()->getFlags() & ~DroppedFlags;
+   return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+                  LD->getChain(), Base, Offset, LD->getPointerInfo(),
+                  LD->getMemoryVT(), LD->getAlignment(), MMOFlags,
+                  LD->getAAInfo());
+ }
+ 
+ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                                SDValue Ptr, MachinePointerInfo PtrInfo,
+                                unsigned Alignment,
+                                MachineMemOperand::Flags MMOFlags,
+                                const AAMDNodes &AAInfo) {
+   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+   const EVT StoredVT = Val.getValueType();
+   // Codegen must never see alignment 0; fall back to the value type's own.
+   Alignment = Alignment ? Alignment : getEVTAlignment(StoredVT);
+   // Mark the access as a store; a load flag here would be a caller bug.
+   MMOFlags |= MachineMemOperand::MOStore;
+   assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+   // If PtrInfo carries no underlying value, let InferPointerInfo fill it in.
+   if (PtrInfo.V.isNull())
+     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+ 
+   MachineMemOperand *MMO = getMachineFunction().getMachineMemOperand(
+       PtrInfo, MMOFlags, StoredVT.getStoreSize(), Alignment, AAInfo);
+   return getStore(Chain, dl, Val, Ptr, MMO);
+ }
+ 
+ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                                SDValue Ptr, MachineMemOperand *MMO) {
+   assert(Chain.getValueType() == MVT::Other &&
+         "Invalid chain type");
+   EVT VT = Val.getValueType();
+   SDVTList VTs = getVTList(MVT::Other); // The node's only result is a chain.
+   SDValue Undef = getUNDEF(Ptr.getValueType()); // Undef offset: unindexed.
+   SDValue Ops[] = { Chain, Val, Ptr, Undef };
+   FoldingSetNodeID ID; // CSE key, extended below with store-specific fields.
+   AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+   ID.AddInteger(VT.getRawBits());
+   ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
+       dl.getIROrder(), VTs, ISD::UNINDEXED, false, VT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<StoreSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+                                    ISD::UNINDEXED, false, VT, MMO);
+   createOperands(N, Ops);
+   // No matching node was found: record this one at the slot computed above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                                     SDValue Ptr, MachinePointerInfo PtrInfo,
+                                     EVT SVT, unsigned Alignment,
+                                     MachineMemOperand::Flags MMOFlags,
+                                     const AAMDNodes &AAInfo) {
+   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+   // Codegen must never see alignment 0; default to the alignment of SVT,
+   // the type that also sizes the memory operand below.
+   Alignment = Alignment ? Alignment : getEVTAlignment(SVT);
+ 
+   // Mark the access as a store; a load flag here would be a caller bug.
+   MMOFlags |= MachineMemOperand::MOStore;
+   assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+   // If PtrInfo carries no underlying value, let InferPointerInfo fill it in.
+   if (PtrInfo.V.isNull())
+     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+ 
+   MachineMemOperand *MMO = getMachineFunction().getMachineMemOperand(
+       PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
+   return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
+ }
+ 
+ SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+                                     SDValue Ptr, EVT SVT,
+                                     MachineMemOperand *MMO) {
+   EVT VT = Val.getValueType();
+   // VT is the type of the value operand; SVT is the type recorded on the node.
+   assert(Chain.getValueType() == MVT::Other &&
+         "Invalid chain type");
+   if (VT == SVT) // Nothing to truncate: build an ordinary store instead.
+     return getStore(Chain, dl, Val, Ptr, MMO);
+   // From here on SVT must be strictly narrower than VT and of the same kind.
+   assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+          "Should only be a truncating store, not extending!");
+   assert(VT.isInteger() == SVT.isInteger() &&
+          "Can't do FP-INT conversion!");
+   assert(VT.isVector() == SVT.isVector() &&
+          "Cannot use trunc store to convert to or from a vector!");
+   assert((!VT.isVector() ||
+           VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+          "Cannot use trunc store to change the number of vector elements!");
+   // Build the CSE key for an unindexed, truncating STORE node.
+   SDVTList VTs = getVTList(MVT::Other);
+   SDValue Undef = getUNDEF(Ptr.getValueType()); // Undef offset: unindexed.
+   SDValue Ops[] = { Chain, Val, Ptr, Undef };
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+   ID.AddInteger(SVT.getRawBits()); // Keyed on SVT, not on the value type.
+   ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
+       dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<StoreSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+                                    ISD::UNINDEXED, true, SVT, MMO);
+   createOperands(N, Ops);
+   // No matching node was found: record this one at the slot computed above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
+                                       SDValue Base, SDValue Offset,
+                                       ISD::MemIndexedMode AM) {
+   StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+   assert(ST->getOffset().isUndef() && "Store is already a indexed store!");
+   SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); // Base-typed + chain.
+   SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+   FoldingSetNodeID ID; // CSE key built from the original store's properties.
+   AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+   ID.AddInteger(ST->getMemoryVT().getRawBits());
+   ID.AddInteger(ST->getRawSubclassData()); // Subclass data of the original store.
+   ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+     return SDValue(E, 0);
+   // No matching node: create one with mode AM, reusing ST's memory operand.
+   auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+                                    ST->isTruncatingStore(), ST->getMemoryVT(),
+                                    ST->getMemOperand());
+   createOperands(N, Ops);
+   // Record the new node at the slot computed by the lookup above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+                                     SDValue Ptr, SDValue Mask, SDValue PassThru,
+                                     EVT MemVT, MachineMemOperand *MMO,
+                                     ISD::LoadExtType ExtTy, bool isExpanding) {
+   SDVTList VTs = getVTList(VT, MVT::Other); // Loaded value plus chain.
+   SDValue Ops[] = { Chain, Ptr, Mask, PassThru };
+   FoldingSetNodeID ID; // CSE key for the MLOAD node.
+   AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+   ID.AddInteger(VT.getRawBits()); // NOTE(review): hashes VT, not MemVT - confirm.
+   ID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
+       dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<MaskedLoadSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+                                         ExtTy, isExpanding, MemVT, MMO);
+   createOperands(N, Ops);
+   // No matching node was found: record this one at the slot computed above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
+                                      SDValue Val, SDValue Ptr, SDValue Mask,
+                                      EVT MemVT, MachineMemOperand *MMO,
+                                      bool IsTruncating, bool IsCompressing) {
+   assert(Chain.getValueType() == MVT::Other &&
+         "Invalid chain type");
+   EVT VT = Val.getValueType();
+   SDVTList VTs = getVTList(MVT::Other); // The node's only result is a chain.
+   SDValue Ops[] = { Chain, Val, Ptr, Mask };
+   FoldingSetNodeID ID; // CSE key for the MSTORE node.
+   AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+   ID.AddInteger(VT.getRawBits()); // NOTE(review): hashes VT, not MemVT - confirm.
+   ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
+       dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<MaskedStoreSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+                                          IsTruncating, IsCompressing, MemVT, MMO);
+   createOperands(N, Ops);
+   // No matching node was found: record this one at the slot computed above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
+                                       ArrayRef<SDValue> Ops,
+                                       MachineMemOperand *MMO) {
+   assert(Ops.size() == 6 && "Incompatible number of operands");
+   // Build the CSE key for the MGATHER node and look for an existing match.
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
+   ID.AddInteger(VT.getRawBits());
+   ID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
+       dl.getIROrder(), VTs, VT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<MaskedGatherSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   // No match: create the node so its operands can be validated below.
+   auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+                                           VTs, VT, MMO);
+   createOperands(N, Ops);
+   // Operand sanity checks; they use the node's accessors, hence run after it.
+   assert(N->getPassThru().getValueType() == N->getValueType(0) &&
+          "Incompatible type of the PassThru value in MaskedGatherSDNode");
+   assert(N->getMask().getValueType().getVectorNumElements() ==
+              N->getValueType(0).getVectorNumElements() &&
+          "Vector width mismatch between mask and data");
+   assert(N->getIndex().getValueType().getVectorNumElements() >=
+              N->getValueType(0).getVectorNumElements() &&
+          "Vector width mismatch between index and data");
+   assert(isa<ConstantSDNode>(N->getScale()) &&
+          cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+          "Scale should be a constant power of 2");
+   // Record the new node at the slot computed by the lookup above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
+                                        ArrayRef<SDValue> Ops,
+                                        MachineMemOperand *MMO) {
+   assert(Ops.size() == 6 && "Incompatible number of operands");
+   // Build the CSE key for the MSCATTER node and look for an existing match.
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
+   ID.AddInteger(VT.getRawBits());
+   ID.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
+       dl.getIROrder(), VTs, VT, MMO));
+   ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+   void *IP = nullptr;
+   if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+     cast<MaskedScatterSDNode>(E)->refineAlignment(MMO); // Reuse the existing node.
+     return SDValue(E, 0);
+   }
+   auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+                                            VTs, VT, MMO);
+   createOperands(N, Ops);
+   // Operand sanity checks; they use the node's accessors, hence run after it.
+   assert(N->getMask().getValueType().getVectorNumElements() ==
+              N->getValue().getValueType().getVectorNumElements() &&
+          "Vector width mismatch between mask and data");
+   assert(N->getIndex().getValueType().getVectorNumElements() >=
+              N->getValue().getValueType().getVectorNumElements() &&
+          "Vector width mismatch between index and data");
+   assert(isa<ConstantSDNode>(N->getScale()) &&
+          cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+          "Scale should be a constant power of 2");
+   // Record the new node at the slot computed by the lookup above.
+   CSEMap.InsertNode(N, IP);
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
+   // select undef, T, F --> T when T is a constant of any type, and F
+   // otherwise.
+   if (Cond.isUndef()) {
+     if (isConstantValueOfAnyType(T))
+       return T;
+     return F;
+   }
+   // An undef arm folds to the other arm: select ?, undef, F --> F and
+   // select ?, T, undef --> T.
+   if (T.isUndef() || F.isUndef())
+     return T.isUndef() ? F : T;
+ 
+   // A constant scalar condition picks its arm: zero selects F, else T.
+   if (auto *CondC = dyn_cast<ConstantSDNode>(Cond)) {
+     if (CondC->isNullValue())
+       return F;
+     return T;
+   }
+   // TODO: This should simplify VSELECT with constant condition using something
+   // like this (but check boolean contents to be complete?):
+   //  if (ISD::isBuildVectorAllOnes(Cond.getNode()))
+   //    return T;
+   //  if (ISD::isBuildVectorAllZeros(Cond.getNode()))
+   //    return F;
+ 
+   // select ?, T, T --> T; anything else is left alone (empty SDValue).
+   return T == F ? T : SDValue();
+ }
+ 
+ SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
+   const EVT VT = X.getValueType();
+   // shift undef, Y --> 0 (the undef value can always be assumed to be 0).
+   if (X.isUndef())
+     return getConstant(0, SDLoc(X.getNode()), VT);
+   // shift X, undef --> undef (the amount may equal the bit width).
+   if (Y.isUndef())
+     return getUNDEF(VT);
+   // shift 0, Y --> 0 and shift X, 0 --> X; in both cases the answer is X.
+   if (isNullOrNullSplat(X) || isNullOrNullSplat(Y))
+     return X;
+ 
+   // shift X, C >= bitwidth(X) --> undef. For vectors every element of the
+   // amount must be too big, otherwise the result would be only partly undef.
+   const auto ScalarBits = X.getScalarValueSizeInBits();
+   auto AmountTooBig = [ScalarBits](ConstantSDNode *Amt) {
+     return Amt->getAPIntValue().uge(ScalarBits);
+   };
+   if (ISD::matchUnaryPredicate(Y, AmountTooBig))
+     return getUNDEF(VT);
+ 
+   return SDValue();
+ }
+ 
+ SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
+                                SDValue Ptr, SDValue SV, unsigned Align) {
+   SDValue Operands[] = {Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32)};
+   return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               ArrayRef<SDUse> Ops) {
+   // Up to three operands go straight to the fixed-arity overloads.
+   if (Ops.empty())
+     return getNode(Opcode, DL, VT);
+   if (Ops.size() == 1)
+     return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
+   if (Ops.size() == 2)
+     return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+   if (Ops.size() == 3)
+     return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+   // Otherwise copy the SDUse array into SDValues for the regular getNode logic.
+   SmallVector<SDValue, 8> Converted(Ops.begin(), Ops.end());
+   return getNode(Opcode, DL, VT, Converted);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+                               ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
+   unsigned NumOps = Ops.size();
+   switch (NumOps) { // Zero to three operands use the fixed-arity overloads.
+   case 0: return getNode(Opcode, DL, VT);
+   case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
+   case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
+   case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2], Flags);
+   default: break;
+   }
+   // NOTE(review): Flags is not referenced on the generic path below.
+   switch (Opcode) {
+   default: break;
+   case ISD::BUILD_VECTOR:
+     // Attempt to simplify BUILD_VECTOR.
+     if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+       return V;
+     break;
+   case ISD::CONCAT_VECTORS:
+     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+     if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+       return V;
+     break;
+   case ISD::SELECT_CC:
+     assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+     assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+            "LHS and RHS of condition must have same type!");
+     assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+            "True and False arms of SelectCC must have same type!");
+     assert(Ops[2].getValueType() == VT &&
+            "select_cc node must be of same type as true and false value!");
+     break;
+   case ISD::BR_CC:
+     assert(NumOps == 5 && "BR_CC takes 5 operands!");
+     assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+            "LHS/RHS of comparison should match types!");
+     break;
+   }
+ 
+   // Memoize nodes.
+   SDNode *N;
+   SDVTList VTs = getVTList(VT);
+   // Nodes whose result type is Glue bypass the CSE map (else branch below).
+   if (VT != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTs, Ops);
+     void *IP = nullptr;
+ 
+     if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+       return SDValue(E, 0);
+ 
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     createOperands(N, Ops);
+     // Record the new node at the slot computed by the lookup above.
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+     createOperands(N, Ops);
+   }
+ 
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+                               ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
+   return getNode(Opcode, DL, getVTList(ResultTys), Ops); // Via an SDVTList.
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               ArrayRef<SDValue> Ops) {
+   if (VTList.NumVTs == 1) // Single result: use the EVT-based overload.
+     return getNode(Opcode, DL, VTList.VTs[0], Ops);
+   // The region below is compiled out (#if 0) and kept only for reference.
+ #if 0
+   switch (Opcode) {
+   // FIXME: figure out how to safely handle things like
+   // int foo(int x) { return 1 << (x & 255); }
+   // int bar() { return foo(256); }
+   case ISD::SRA_PARTS:
+   case ISD::SRL_PARTS:
+   case ISD::SHL_PARTS:
+     if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+         cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+       return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+     else if (N3.getOpcode() == ISD::AND)
+       if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+         // If the and is only masking out bits that cannot affect the shift,
+         // eliminate the and.
+         unsigned NumBits = VT.getScalarSizeInBits()*2;
+         if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+           return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+       }
+     break;
+   }
+ #endif
+ 
+   // Memoize the node unless its last result type is Glue.
+   SDNode *N;
+   if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opcode, VTList, Ops);
+     void *IP = nullptr;
+     if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+       return SDValue(E, 0);
+     // No matching node: create one and record it at the slot found above.
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+     createOperands(N, Ops);
+     CSEMap.InsertNode(N, IP);
+   } else {
+     N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+     createOperands(N, Ops);
+   }
+   InsertNode(N);
+   SDValue V(N, 0);
+   NewSDValueDbgMsg(V, "Creating new node: ", this);
+   return V;
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+                               SDVTList VTList) {
+   return getNode(Opcode, DL, VTList, ArrayRef<SDValue>()); // No operands.
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               SDValue N1) {
+   SDValue Operands[] = {N1}; // Wrapped for the ArrayRef overload.
+   return getNode(Opcode, DL, VTList, Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               SDValue N1, SDValue N2) {
+   SDValue Operands[] = {N1, N2}; // Packed for the ArrayRef overload.
+   return getNode(Opcode, DL, VTList, Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               SDValue N1, SDValue N2, SDValue N3) {
+   SDValue Operands[] = {N1, N2, N3}; // Packed for the ArrayRef overload.
+   return getNode(Opcode, DL, VTList, Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+   SDValue Operands[] = {N1, N2, N3, N4}; // Packed for the ArrayRef overload.
+   return getNode(Opcode, DL, VTList, Operands);
+ }
+ 
+ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+                               SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+                               SDValue N5) {
+   SDValue Operands[] = {N1, N2, N3, N4, N5}; // Packed for the ArrayRef overload.
+   return getNode(Opcode, DL, VTList, Operands);
+ }
+ 
+ SDVTList SelectionDAG::getVTList(EVT VT) {
+   return makeVTList(SDNode::getValueTypeList(VT), 1); // One-element list.
+ }
+ 
+ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
+   // Key the interned list on its length followed by each type's raw bits.
+   FoldingSetNodeID Key;
+   Key.AddInteger(2U);
+   Key.AddInteger(VT1.getRawBits());
+   Key.AddInteger(VT2.getRawBits());
+   void *InsertPos = nullptr;
+   if (SDVTListNode *Hit = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
+     return Hit->getSDVTList();
+   // Not interned yet: allocate the type array and register a new list node.
+   EVT *Types = Allocator.Allocate<EVT>(2);
+   Types[0] = VT1;
+   Types[1] = VT2;
+   auto *Fresh = new (Allocator) SDVTListNode(Key.Intern(Allocator), Types, 2);
+   VTListMap.InsertNode(Fresh, InsertPos);
+   return Fresh->getSDVTList();
+ }
+ 
+ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
+   // Key the interned list on its length followed by each type's raw bits.
+   FoldingSetNodeID Key;
+   Key.AddInteger(3U);
+   Key.AddInteger(VT1.getRawBits());
+   Key.AddInteger(VT2.getRawBits());
+   Key.AddInteger(VT3.getRawBits());
+   void *InsertPos = nullptr;
+   if (SDVTListNode *Hit = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
+     return Hit->getSDVTList();
+   // Not interned yet: allocate the type array and register a new list node.
+   EVT *Types = Allocator.Allocate<EVT>(3);
+   Types[0] = VT1;
+   Types[1] = VT2;
+   Types[2] = VT3;
+   auto *Fresh = new (Allocator) SDVTListNode(Key.Intern(Allocator), Types, 3);
+   VTListMap.InsertNode(Fresh, InsertPos);
+   return Fresh->getSDVTList();
+ }
+ 
+ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
+   // Key the interned list on its length followed by each type's raw bits.
+   FoldingSetNodeID Key;
+   Key.AddInteger(4U);
+   Key.AddInteger(VT1.getRawBits());
+   Key.AddInteger(VT2.getRawBits());
+   Key.AddInteger(VT3.getRawBits());
+   Key.AddInteger(VT4.getRawBits());
+   void *InsertPos = nullptr;
+   if (SDVTListNode *Hit = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
+     return Hit->getSDVTList();
+   // Not interned yet: allocate the type array and register a new list node.
+   EVT *Types = Allocator.Allocate<EVT>(4);
+   Types[0] = VT1;
+   Types[1] = VT2;
+   Types[2] = VT3;
+   Types[3] = VT4;
+   auto *Fresh = new (Allocator) SDVTListNode(Key.Intern(Allocator), Types, 4);
+   VTListMap.InsertNode(Fresh, InsertPos);
+   return Fresh->getSDVTList();
+ }
+ 
+ SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
+   // Key the interned list on its length followed by each type's raw bits.
+   const unsigned NumVTs = VTs.size();
+   FoldingSetNodeID Key;
+   Key.AddInteger(NumVTs);
+   for (const EVT &VT : VTs)
+     Key.AddInteger(VT.getRawBits());
+   void *InsertPos = nullptr;
+   if (SDVTListNode *Hit = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
+     return Hit->getSDVTList();
+   // Not interned yet: copy the types into storage obtained from Allocator
+   // and register a new list node.
+   EVT *Types = Allocator.Allocate<EVT>(NumVTs);
+   llvm::copy(VTs, Types);
+   auto *Fresh =
+       new (Allocator) SDVTListNode(Key.Intern(Allocator), Types, NumVTs);
+   VTListMap.InsertNode(Fresh, InsertPos);
+   return Fresh->getSDVTList();
+ }
+ 
+ 
+ /// UpdateNodeOperands - *Mutate* \p N in place so that its single operand is
+ /// \p Op. If the resultant node already exists in the DAG, \p N is left
+ /// unmodified and the node that already exists is returned instead;
+ /// otherwise \p N itself is returned. As a degenerate case, passing the
+ /// operand the node already has simply returns \p N.
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
+   assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+ 
+   // Nothing to do when the operand is unchanged.
+   if (N->getOperand(0) == Op)
+     return N;
+ 
+   // Look for a node that already has the requested form.
+   void *InsertPos = nullptr;
+   if (SDNode *Twin = FindModifiedNodeSlot(N, Op, InsertPos))
+     return Twin;
+ 
+   // N is about to change, so take it out of the CSE maps first; if that
+   // fails, drop the slot so N is not inserted afterwards.
+   if (InsertPos && !RemoveNodeFromCSEMaps(N))
+     InsertPos = nullptr;
+ 
+   // Rewrite the operand, then update divergence for the modified node.
+   N->OperandList[0].set(Op);
+   updateDivergence(N);
+   // If this gets put into a CSE map, add it.
+   if (InsertPos)
+     CSEMap.InsertNode(N, InsertPos);
+   return N;
+ }
+ 
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
+   assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+ 
+   // Same operands as before: hand the node back untouched.
+   if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+     return N;
+ 
+   // Look for a node that already has the requested form.
+   void *InsertPos = nullptr;
+   if (SDNode *Twin = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+     return Twin;
+ 
+   // N is about to change, so take it out of the CSE maps first; if that
+   // fails, drop the slot so N is not inserted afterwards.
+   if (InsertPos && !RemoveNodeFromCSEMaps(N))
+     InsertPos = nullptr;
+ 
+   // Rewrite only the operands that differ, then update divergence.
+   SDValue NewOps[] = {Op1, Op2};
+   for (unsigned Idx = 0; Idx != 2; ++Idx)
+     if (N->OperandList[Idx] != NewOps[Idx])
+       N->OperandList[Idx].set(NewOps[Idx]);
+   updateDivergence(N);
+   // If this gets put into a CSE map, add it.
+   if (InsertPos)
+     CSEMap.InsertNode(N, InsertPos);
+   return N;
+ }
+ 
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                          SDValue Op3) {
+   SDValue NewOps[] = {Op1, Op2, Op3}; // Packed for the ArrayRef overload.
+   return UpdateNodeOperands(N, NewOps);
+ }
+ 
+ /// Four-operand form; packs the operands for the ArrayRef overload.
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                          SDValue Op3, SDValue Op4) {
+   SDValue NewOps[] = {Op1, Op2, Op3, Op4};
+   return UpdateNodeOperands(N, NewOps);
+ }
+ 
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                                          SDValue Op3, SDValue Op4,
+                                          SDValue Op5) {
+   SDValue NewOps[] = {Op1, Op2, Op3, Op4, Op5}; // For the ArrayRef overload.
+   return UpdateNodeOperands(N, NewOps);
+ }
+ 
+ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
+   unsigned NumOps = Ops.size();
+   assert(N->getNumOperands() == NumOps &&
+          "Update with wrong number of operands");
+ 
+   // Identical operand list: hand the node back untouched.
+   if (std::equal(Ops.begin(), Ops.end(), N->op_begin()))
+     return N;
+ 
+   // Look for a node that already has the requested form.
+   void *InsertPos = nullptr;
+   if (SDNode *Twin = FindModifiedNodeSlot(N, Ops, InsertPos))
+     return Twin;
+ 
+   // N is about to change, so take it out of the CSE maps first; if that
+   // fails, drop the slot so N is not inserted afterwards.
+   if (InsertPos && !RemoveNodeFromCSEMaps(N))
+     InsertPos = nullptr;
+ 
+   // Rewrite only the operands that differ, then update divergence.
+   for (unsigned Idx = 0; Idx != NumOps; ++Idx)
+     if (N->OperandList[Idx] != Ops[Idx])
+       N->OperandList[Idx].set(Ops[Idx]);
+   updateDivergence(N);
+ 
+   // If this gets put into a CSE map, add it.
+   if (InsertPos)
+     CSEMap.InsertNode(N, InsertPos);
+   return N;
+ }
+ 
+ /// DropOperands - Release the operands of this node by resetting every
+ /// operand use to an empty SDValue.
+ void SDNode::DropOperands() {
+   // Unlike the code in MorphNodeTo that does this, we don't need to
+   // watch for dead nodes here, so a plain walk over the operand uses is
+   // enough.
+   const op_iterator End = op_end();
+   for (op_iterator Cur = op_begin(); Cur != End; ++Cur)
+     Cur->set(SDValue());
+ }
+ 
+ void SelectionDAG::setNodeMemRefs(MachineSDNode *N,
+                                   ArrayRef<MachineMemOperand *> NewMemRefs) {
+   const size_t Count = NewMemRefs.size();
+   if (Count == 0) {
+     N->clearMemRefs();
+     return;
+   }
+ 
+   // A single reference is stored directly, which avoids allocating.
+   if (Count == 1) {
+     N->MemRefs = NewMemRefs.front();
+     N->NumMemRefs = 1;
+     return;
+   }
+   // Several references: copy them into a buffer obtained from Allocator.
+   auto **Buffer = Allocator.template Allocate<MachineMemOperand *>(Count);
+   llvm::copy(NewMemRefs, Buffer);
+   N->MemRefs = Buffer;
+   N->NumMemRefs = static_cast<int>(Count);
+ }
+ 
+ /// SelectNodeTo - Family of wrappers around MorphNodeTo that take a machine
+ /// opcode. This form yields a single result of type \p VT and no operands.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT) {
+   // Build the one-element result type list, then use the general overload
+   // with an empty operand list.
+   SDVTList ResultTys = getVTList(VT);
+   return SelectNodeTo(N, MachineOpc, ResultTys, None);
+ }
+ 
+ /// One result of type \p VT, one operand.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                                    SDValue Op1) {
+   SDValue Operands[] = {Op1};
+   return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
+ }
+ 
+ /// One result of type \p VT, two operands.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                                    SDValue Op1, SDValue Op2) {
+   // Pack the operands and build the result list for the general overload.
+   SDValue Operands[] = {Op1, Op2};
+   return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
+ }
+ 
+ /// One result of type \p VT, three operands.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                                    SDValue Op1, SDValue Op2, SDValue Op3) {
+   // Pack the operands and build the result list for the general overload.
+   SDValue Operands[] = {Op1, Op2, Op3};
+   return SelectNodeTo(N, MachineOpc, getVTList(VT), Operands);
+ }
+ 
+ /// One result of type \p VT, arbitrary operand list.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
+                                    ArrayRef<SDValue> Ops) {
+   return SelectNodeTo(N, MachineOpc, getVTList(VT), Ops);
+ }
+ 
+ /// Two results (\p VT1, \p VT2), arbitrary operand list.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                                    EVT VT2, ArrayRef<SDValue> Ops) {
+   return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), Ops);
+ }
+ 
+ /// Two results (\p VT1, \p VT2), no operands.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                                    EVT VT2) {
+   return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), None);
+ }
+ 
+ /// Three results (\p VT1, \p VT2, \p VT3), arbitrary operand list.
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
+                                    EVT VT2, EVT VT3, ArrayRef<SDValue> Ops) {
+   SDVTList ResultTys = getVTList(VT1, VT2, VT3);
+   return SelectNodeTo(N, MachineOpc, ResultTys, Ops);
+ }
+ 
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                    EVT VT1, EVT VT2,
+                                    SDValue Op1, SDValue Op2) {
+   // Two result types, two operands.
+   SDValue Operands[] = { Op1, Op2 };
+   return SelectNodeTo(N, MachineOpc, getVTList(VT1, VT2), Operands);
+ }
+ 
+ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                    SDVTList VTs, ArrayRef<SDValue> Ops) {
+   SDNode *Result = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
+   Result->setNodeId(-1); // Reset the NodeID to -1.
+   if (Result == N)
+     return Result;
+   // MorphNodeTo handed back a different node: move N's users over, drop N.
+   ReplaceAllUsesWith(N, Result);
+   RemoveDeadNode(N);
+   return Result;
+ }
+ 
+ /// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away
+ /// the line number information on the merged node since it is not possible to
+ /// preserve the information that operation is associated with multiple lines.
+ /// This will make the debugger work better at -O0, where there is a higher
+ /// probability of having other instructions associated with that line.
+ ///
+ /// For IROrder, we keep the smaller of the two.
+ SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
+   const bool AtO0 = OptLevel == CodeGenOpt::None;
+   DebugLoc Existing = N->getDebugLoc();
+   // Only a node that has a location which disagrees with OLoc is cleared.
+   if (Existing && AtO0 && OLoc.getDebugLoc() != Existing)
+     N->setDebugLoc(DebugLoc());
+   N->setIROrder(std::min(N->getIROrder(), OLoc.getIROrder()));
+   return N;
+ }
+ 
+ /// MorphNodeTo - This *mutates* the specified node in place to have the
+ /// specified result types, opcode, and operands.
+ ///
+ /// Note that MorphNodeTo returns the resultant node.  If there is already a
+ /// node of the specified opcode and operands, that node is returned instead
+ /// (with its SDLoc merged with N's) and N itself is left unmodified.
+ ///
+ /// Using MorphNodeTo is faster than creating a new node and swapping it in
+ /// with ReplaceAllUsesWith both because it often avoids allocating a new
+ /// node, and because it doesn't require CSE recalculation for any of
+ /// the node's users.
+ ///
+ /// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
+ /// As a consequence it isn't appropriate to use from within the DAG combiner or
+ /// the legalizer which maintain worklists that would need to be updated when
+ /// deleting things.
+ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+                                   SDVTList VTs, ArrayRef<SDValue> Ops) {
+   // Unless the last result is Glue, reuse an identical existing node if any.
+   void *IP = nullptr;
+   if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, Opc, VTs, Ops);
+     if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
+       return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
+   }
+ 
+   if (!RemoveNodeFromCSEMaps(N))
+     IP = nullptr; // Skip the CSE re-insertion at the end.
+ 
+   // Start the morphing: overwrite the opcode and the result type list.
+   N->NodeType = Opc;
+   N->ValueList = VTs.VTs;
+   N->NumValues = VTs.NumVTs;
+ 
+   // Clear the operands list, updating used nodes to remove this from their
+   // use list.  Keep track of any operands that become dead as a result.
+   SmallPtrSet<SDNode*, 16> DeadNodeSet;
+   for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+     SDUse &Use = *I++;
+     SDNode *Used = Use.getNode();
+     Use.set(SDValue());
+     if (Used->use_empty())
+       DeadNodeSet.insert(Used);
+   }
+ 
+   // For a MachineSDNode, clear any memory references it carried.
+   if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
+     MN->clearMemRefs();
+ 
+   // Swap for an appropriately sized array from the recycler.
+   removeOperands(N);
+   createOperands(N, Ops);
+ 
+   // Delete any nodes that are still dead after adding the uses for the
+   // new operands (an old operand may be used again by the new list).
+   if (!DeadNodeSet.empty()) {
+     SmallVector<SDNode *, 16> DeadNodes;
+     for (SDNode *N : DeadNodeSet)
+       if (N->use_empty())
+         DeadNodes.push_back(N);
+     RemoveDeadNodes(DeadNodes);
+   }
+ 
+   if (IP)
+     CSEMap.InsertNode(N, IP);   // Memoize the morphed node.
+   return N;
+ }
+ 
+ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
+   unsigned OrigOpc = Node->getOpcode();
+   unsigned NewOpc;
+   bool IsUnary = false;   // Morph with operand 1 only.
+   bool IsTernary = false; // Morph with operands 1, 2 and 3.
+   switch (OrigOpc) { // Map each STRICT_* opcode to its non-strict opcode.
+   default:
+     llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
+   case ISD::STRICT_FADD: NewOpc = ISD::FADD; break;
+   case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break;
+   case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break;
+   case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break;
+   case ISD::STRICT_FREM: NewOpc = ISD::FREM; break;
+   case ISD::STRICT_FMA: NewOpc = ISD::FMA; IsTernary = true; break;
+   case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break;
+   case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break;
+   case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break;
+   case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break;
+   case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break;
+   case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break;
+   case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break;
+   case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break;
+   case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break;
+   case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break;
+   case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break;
+   case ISD::STRICT_FNEARBYINT:
+     NewOpc = ISD::FNEARBYINT;
+     IsUnary = true;
+     break;
+   case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
+   case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
+   case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; IsUnary = true; break;
+   case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; IsUnary = true; break;
+   case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; IsUnary = true; break;
+   case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; IsUnary = true; break;
+   }
+ 
+   // Unlink from the chain: users of result 1 are redirected to operand 0.
+   SDValue InputChain = Node->getOperand(0);
+   SDValue OutputChain = SDValue(Node, 1);
+   ReplaceAllUsesOfValueWith(OutputChain, InputChain);
+ 
+   SDVTList VTs = getVTList(Node->getOperand(1).getValueType());
+   SDNode *Res = nullptr;
+   if (IsUnary)
+     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) });
+   else if (IsTernary)
+     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
+                                            Node->getOperand(2),
+                                            Node->getOperand(3)});
+   else // Neither flag set: morph with operands 1 and 2.
+     Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
+                                            Node->getOperand(2) });
+ 
+   // MorphNodeTo can operate in two ways: if an existing node with the
+   // specified operands exists, it can just return it.  Otherwise, it
+   // updates the node in place to have the requested operands.
+   if (Res == Node) {
+     // If we updated the node in place, reset the node ID.  To the isel,
+     // this should be just like a newly allocated machine node.
+     Res->setNodeId(-1);
+   } else {
+     ReplaceAllUsesWith(Node, Res);
+     RemoveDeadNode(Node);
+   }
+ 
+   return Res;
+ }
+ 
+ /// getMachineNode - These are used by target selectors to create a new node
+ /// with the specified return type(s), MachineInstr opcode, and operands.
+ ///
+ /// Note that getMachineNode returns the resultant node.  If there is already a
+ /// node of the specified opcode and operands, that node is returned instead of
+ /// a newly created one.
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT) {
+   // One result type, no operands.
+   return getMachineNode(Opcode, dl, getVTList(VT), None);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT, SDValue Op1) {
+   // One result type, one operand.
+   SDValue Operands[] = { Op1 };
+   return getMachineNode(Opcode, dl, getVTList(VT), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT, SDValue Op1, SDValue Op2) {
+   // One result type, two operands.
+   SDValue Operands[] = { Op1, Op2 };
+   return getMachineNode(Opcode, dl, getVTList(VT), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT, SDValue Op1, SDValue Op2,
+                                             SDValue Op3) {
+   // One result type, three operands.
+   SDValue Operands[] = { Op1, Op2, Op3 };
+   return getMachineNode(Opcode, dl, getVTList(VT), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT, ArrayRef<SDValue> Ops) {
+   // One result type, caller-provided operand list.
+   return getMachineNode(Opcode, dl, getVTList(VT), Ops);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2, SDValue Op1,
+                                             SDValue Op2) {
+   // Two result types, two operands.
+   SDValue Operands[] = { Op1, Op2 };
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2, SDValue Op1,
+                                             SDValue Op2, SDValue Op3) {
+   // Two result types, three operands.
+   SDValue Operands[] = { Op1, Op2, Op3 };
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2,
+                                             ArrayRef<SDValue> Ops) {
+   // Two result types, caller-provided operand list.
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2), Ops);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2, EVT VT3,
+                                             SDValue Op1, SDValue Op2) {
+   // Three result types, two operands.
+   SDValue Operands[] = { Op1, Op2 };
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2, EVT VT3,
+                                             SDValue Op1, SDValue Op2,
+                                             SDValue Op3) {
+   // Three result types, three operands.
+   SDValue Operands[] = { Op1, Op2, Op3 };
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Operands);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             EVT VT1, EVT VT2, EVT VT3,
+                                             ArrayRef<SDValue> Ops) {
+   // Three result types, caller-provided operand list.
+   return getMachineNode(Opcode, dl, getVTList(VT1, VT2, VT3), Ops);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+                                             ArrayRef<EVT> ResultTys,
+                                             ArrayRef<SDValue> Ops) {
+   // Arbitrary result type list, caller-provided operand list.
+   return getMachineNode(Opcode, dl, getVTList(ResultTys), Ops);
+ }
+ 
+ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
+                                             SDVTList VTs,
+                                             ArrayRef<SDValue> Ops) {
+   // Nodes whose last result type is Glue are not entered into the CSE map.
+   const bool CanCSE = VTs.VTs[VTs.NumVTs - 1] != MVT::Glue;
+   void *InsertPos = nullptr;
+ 
+   if (CanCSE) {
+     FoldingSetNodeID ID;
+     AddNodeIDNode(ID, ~Opcode, VTs, Ops);
+     if (SDNode *Existing = FindNodeOrInsertPos(ID, DL, InsertPos))
+       return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(Existing, DL));
+   }
+ 
+   // No reusable node: allocate a fresh MachineSDNode and attach its operands.
+   MachineSDNode *Fresh =
+       newSDNode<MachineSDNode>(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+   createOperands(Fresh, Ops);
+ 
+   if (CanCSE)
+     CSEMap.InsertNode(Fresh, InsertPos);
+ 
+   InsertNode(Fresh);
+   return Fresh;
+ }
+ 
+ 
+ /// getTargetExtractSubreg - Convenience helper that builds a
+ /// TargetOpcode::EXTRACT_SUBREG node from (Operand, SRIdx as i32 constant).
+ SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+                                              SDValue Operand) {
+   SDValue IdxOp = getTargetConstant(SRIdx, DL, MVT::i32);
+   SDNode *Extract =
+       getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, Operand, IdxOp);
+   return SDValue(Extract, 0);
+ }
+ 
+ /// getTargetInsertSubreg - Convenience helper that builds a
+ /// TargetOpcode::INSERT_SUBREG node from (Operand, Subreg, SRIdx as i32).
+ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+                                             SDValue Operand, SDValue Subreg) {
+   SDValue IdxOp = getTargetConstant(SRIdx, DL, MVT::i32);
+   SDNode *Insert = getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT, Operand,
+                                   Subreg, IdxOp);
+   return SDValue(Insert, 0);
+ }
+ 
+ /// getNodeIfExists - Look up an already-existing node with the given opcode,
+ /// value types and operands; returns NULL when there is none.
+ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+                                       ArrayRef<SDValue> Ops,
+                                       const SDNodeFlags Flags) {
+   // Value type lists that end in Glue are never looked up.
+   if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)
+     return nullptr;
+   FoldingSetNodeID ID;
+   AddNodeIDNode(ID, Opcode, VTList, Ops);
+   void *InsertPos = nullptr;
+   SDNode *Found = FindNodeOrInsertPos(ID, SDLoc(), InsertPos);
+   if (Found)
+     Found->intersectFlagsWith(Flags);
+   return Found;
+ }
+ 
+ /// getDbgValue - Creates an SDDbgValue describing result R of SDNode N.
+ ///
+ /// The object is placement-allocated from DbgInfo's allocator.
+ SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr,
+                                       SDNode *N, unsigned R, bool IsIndirect,
+                                       const DebugLoc &DL, unsigned O) {
+   assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+          "Expected inlined-at fields to agree");
+   auto &Allocator = DbgInfo->getAlloc();
+   return new (Allocator) SDDbgValue(Var, Expr, N, R, IsIndirect, DL, O);
+ }
+ 
+ /// Constant - Creates an SDDbgValue that refers to the constant Value C.
+ SDDbgValue *SelectionDAG::getConstantDbgValue(
+     DIVariable *Var, DIExpression *Expr, const Value *C, const DebugLoc &DL,
+     unsigned O) {
+   assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+          "Expected inlined-at fields to agree");
+   auto &Allocator = DbgInfo->getAlloc();
+   return new (Allocator) SDDbgValue(Var, Expr, C, DL, O);
+ }
+ 
+ /// FrameIndex - Creates an SDDbgValue of kind SDDbgValue::FRAMEIX that
+ /// refers to frame index FI.
+ SDDbgValue *SelectionDAG::getFrameIndexDbgValue(
+     DIVariable *Var, DIExpression *Expr, unsigned FI, bool IsIndirect,
+     const DebugLoc &DL, unsigned O) {
+   assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+          "Expected inlined-at fields to agree");
+   auto &Allocator = DbgInfo->getAlloc();
+   return new (Allocator)
+       SDDbgValue(Var, Expr, FI, IsIndirect, DL, O, SDDbgValue::FRAMEIX);
+ }
+ 
+ /// VReg - Creates an SDDbgValue of kind SDDbgValue::VREG for register VReg.
+ SDDbgValue *SelectionDAG::getVRegDbgValue(
+     DIVariable *Var, DIExpression *Expr, unsigned VReg, bool IsIndirect,
+     const DebugLoc &DL, unsigned O) {
+   assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+          "Expected inlined-at fields to agree");
+   auto &Allocator = DbgInfo->getAlloc();
+   return new (Allocator)
+       SDDbgValue(Var, Expr, VReg, IsIndirect, DL, O, SDDbgValue::VREG);
+ }
+ 
+ void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
+                                      unsigned OffsetInBits, unsigned SizeInBits,
+                                      bool InvalidateDbg) {
+   SDNode *FromNode = From.getNode();
+   SDNode *ToNode = To.getNode();
+   assert(FromNode && ToNode && "Can't modify dbg values");
+ 
+   // PR35338: redundant transfers are tolerated for now instead of asserted.
+   // TODO: assert(From != To && "Redundant dbg value transfer");
+   // TODO: assert(FromNode != ToNode && "Intranode dbg value transfer");
+   if (From == To || FromNode == ToNode)
+     return;
+ 
+   if (!FromNode->getHasDebugValue())
+     return; // No dbg values to move.
+ 
+   SmallVector<SDDbgValue *, 2> ClonedDVs;
+   for (SDDbgValue *Dbg : GetDbgValues(FromNode)) {
+     if (Dbg->getKind() != SDDbgValue::SDNODE || Dbg->isInvalidated())
+       continue;
+ 
+     // TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value");
+ 
+     // Only transfer dbg values attached to the same result number as From.
+     if (Dbg->getResNo() != From.getResNo())
+       continue;
+ 
+     DIVariable *Var = Dbg->getVariable();
+     auto *Expr = Dbg->getExpression();
+     // If a fragment is requested (SizeInBits != 0), update the expression.
+     if (SizeInBits) {
+       // When splitting a larger (e.g., sign-extended) value whose
+       // lower bits are described with an SDDbgValue, do not attempt
+       // to transfer the SDDbgValue to the upper bits.
+       if (auto FI = Expr->getFragmentInfo())
+         if (OffsetInBits + SizeInBits > FI->SizeInBits)
+           continue;
+       auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits,
+                                                              SizeInBits);
+       if (!Fragment)
+         continue; // No fragment expression could be created.
+       Expr = *Fragment;
+     }
+     // Clone the SDDbgValue onto To; clones are attached after the loop.
+     SDDbgValue *Clone =
+         getDbgValue(Var, Expr, ToNode, To.getResNo(), Dbg->isIndirect(),
+                     Dbg->getDebugLoc(), Dbg->getOrder());
+     ClonedDVs.push_back(Clone);
+ 
+     if (InvalidateDbg) {
+       // Invalidate value and indicate the SDDbgValue should not be emitted.
+       Dbg->setIsInvalidated();
+       Dbg->setIsEmitted();
+     }
+   }
+ 
+   for (SDDbgValue *Dbg : ClonedDVs)
+     AddDbgValue(Dbg, ToNode, false);
+ }
+ 
+ void SelectionDAG::salvageDebugInfo(SDNode &N) {
+   if (!N.getHasDebugValue())
+     return; // Nothing attached to N, so nothing to salvage.
+ 
+   SmallVector<SDDbgValue *, 2> ClonedDVs;
+   for (auto DV : GetDbgValues(&N)) {
+     if (DV->isInvalidated())
+       continue;
+     switch (N.getOpcode()) {
+     default:
+       break;
+     case ISD::ADD:
+       SDValue N0 = N.getOperand(0);
+       SDValue N1 = N.getOperand(1);
+       if (!isConstantIntBuildVectorOrConstantInt(N0) &&
+           isConstantIntBuildVectorOrConstantInt(N1)) {
+         uint64_t Offset = N.getConstantOperandVal(1);
+         // Fold the ADD of a constant into the DIExpression and re-attach the
+         // dbg value to the non-constant operand N0. Since the expression now
+         // performs arithmetic to compute the variable's *value*, it must be
+         // marked with a DW_OP_stack_value.
+         auto *DIExpr = DV->getExpression();
+         DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset,
+                                        DIExpression::NoDeref,
+                                        DIExpression::WithStackValue);
+         SDDbgValue *Clone =
+             getDbgValue(DV->getVariable(), DIExpr, N0.getNode(), N0.getResNo(),
+                         DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
+         ClonedDVs.push_back(Clone);
+         DV->setIsInvalidated(); // The clone supersedes the original.
+         DV->setIsEmitted();
+         LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
+                    N0.getNode()->dumprFull(this);
+                    dbgs() << " into " << *DIExpr << '\n');
+       }
+     }
+   }
+ 
+   for (SDDbgValue *Dbg : ClonedDVs)
+     AddDbgValue(Dbg, Dbg->getSDNode(), false);
+ }
+ 
+ /// getDbgLabel - Creates an SDDbgLabel for Label, allocated from DbgInfo.
+ SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label, const DebugLoc &DL,
+                                       unsigned O) {
+   assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
+          "Expected inlined-at fields to agree");
+   return new (DbgInfo->getAlloc()) SDDbgLabel(Label, DL, O);
+ }
+ 
+ namespace {
+ 
+ /// RAUWUpdateListener - Helper for ReplaceAllUsesWith.  Nodes may be deleted
+ /// while a replacement is in flight; whenever the use iterator currently
+ /// points at a deleted node it is advanced so that it never dangles.
+ ///
+ class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
+   SDNode::use_iterator &Current;
+   SDNode::use_iterator &End;
+ 
+   void NodeDeleted(SDNode *N, SDNode *E) override {
+     // Step past every consecutive use that belongs to the deleted node.
+     while (Current != End && *Current == N)
+       ++Current;
+   }
+ 
+ public:
+   RAUWUpdateListener(SelectionDAG &DAG,
+                      SDNode::use_iterator &Begin,
+                      SDNode::use_iterator &Finish)
+     : SelectionDAG::DAGUpdateListener(DAG), Current(Begin), End(Finish) {}
+ };
+ 
+ } // end anonymous namespace
+ 
+ /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+ /// This can cause recursive merging of nodes in the DAG.
+ ///
+ /// This version assumes From has a single result value.
+ ///
+ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
+   SDNode *From = FromN.getNode();
+   assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+          "Cannot replace with this method!");
+   assert(From != To.getNode() && "Cannot replace uses of with self");
+ 
+   // Preserve Debug Values
+   transferDbgValues(FromN, To);
+ 
+   // Iterate over all the existing uses of From. New uses will be added
+   // to the beginning of the use list, which we avoid visiting.
+   // This specifically avoids visiting uses of From that arise while the
+   // replacement is happening, because any such uses would be the result
+   // of CSE: If an existing node looks like From after one of its operands
+   // is replaced by To, we don't want to replace all of its users with To
+   // too. See PR3018 for more info.
+   SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+   RAUWUpdateListener Listener(*this, UI, UE);
+   while (UI != UE) {
+     SDNode *User = *UI;
+ 
+     // This node is about to morph, remove its old self from the CSE maps.
+     RemoveNodeFromCSEMaps(User);
+ 
+     // A user can appear in a use list multiple times, and when this
+     // happens the uses are usually next to each other in the list.
+     // To help reduce the number of CSE recomputations, process all
+     // the uses of this user that we can find this way.
+     do {
+       SDUse &Use = UI.getUse();
+       ++UI; // Advance before Use.set() retargets this use.
+       Use.set(To);
+       if (To->isDivergent() != From->isDivergent())
+         updateDivergence(User);
+     } while (UI != UE && *UI == User);
+     // Now that we have modified User, add it back to the CSE maps.  If it
+     // already exists there, recursively merge the results together.
+     AddModifiedNodeToCSEMaps(User);
+   }
+ 
+   // If we just RAUW'd the root, make To the new root.
+   if (FromN == getRoot())
+     setRoot(To);
+ }
+ 
+ /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+ /// This can cause recursive merging of nodes in the DAG.
+ ///
+ /// This version assumes that for each value of From, there is a
+ /// corresponding value in To in the same position with the same type.
+ ///
+ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
+ #ifndef NDEBUG
+   for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+     assert((!From->hasAnyUseOfValue(i) ||
+             From->getValueType(i) == To->getValueType(i)) &&
+            "Cannot use this version of ReplaceAllUsesWith!");
+ #endif
+ 
+   // Handle the trivial case.
+   if (From == To)
+     return;
+ 
+   // Preserve Debug Info. Only do this for values of From that have a use.
+   for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+     if (From->hasAnyUseOfValue(i)) {
+       assert((i < To->getNumValues()) && "Invalid To location");
+       transferDbgValues(SDValue(From, i), SDValue(To, i));
+     }
+ 
+   // Iterate over just the existing users of From. See the comments in
+   // the ReplaceAllUsesWith above.
+   SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+   RAUWUpdateListener Listener(*this, UI, UE);
+   while (UI != UE) {
+     SDNode *User = *UI;
+ 
+     // This node is about to morph, remove its old self from the CSE maps.
+     RemoveNodeFromCSEMaps(User);
+ 
+     // A user can appear in a use list multiple times, and when this
+     // happens the uses are usually next to each other in the list.
+     // To help reduce the number of CSE recomputations, process all
+     // the uses of this user that we can find this way.
+     do {
+       SDUse &Use = UI.getUse();
+       ++UI; // Advance before the use is retargeted below.
+       Use.setNode(To); // Presumably keeps the result number; TODO confirm.
+       if (To->isDivergent() != From->isDivergent())
+         updateDivergence(User);
+     } while (UI != UE && *UI == User);
+ 
+     // Now that we have modified User, add it back to the CSE maps.  If it
+     // already exists there, recursively merge the results together.
+     AddModifiedNodeToCSEMaps(User);
+   }
+ 
+   // If we just RAUW'd the root, re-root at To, keeping the result number.
+   if (From == getRoot().getNode())
+     setRoot(SDValue(To, getRoot().getResNo()));
+ }
+ 
+ /// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+ /// This can cause recursive merging of nodes in the DAG.
+ ///
+ /// This version can replace From with any result values.  To must match the
+ /// number and types of values returned by From (To is indexed by result no).
+ void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
+   if (From->getNumValues() == 1)  // Handle the simple case efficiently.
+     return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
+ 
+   // Preserve Debug Info for every result of From.
+   for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+     transferDbgValues(SDValue(From, i), To[i]);
+ 
+   // Iterate over just the existing users of From. See the comments in
+   // the ReplaceAllUsesWith above.
+   SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+   RAUWUpdateListener Listener(*this, UI, UE);
+   while (UI != UE) {
+     SDNode *User = *UI;
+ 
+     // This node is about to morph, remove its old self from the CSE maps.
+     RemoveNodeFromCSEMaps(User);
+ 
+     // A user can appear in a use list multiple times, and when this happens the
+     // uses are usually next to each other in the list.  To help reduce the
+     // number of CSE and divergence recomputations, process all the uses of this
+     // user that we can find this way.
+     bool To_IsDivergent = false; // OR over the replacements User now uses.
+     do {
+       SDUse &Use = UI.getUse();
+       const SDValue &ToOp = To[Use.getResNo()]; // Replacement for that result.
+       ++UI;
+       Use.set(ToOp);
+       To_IsDivergent |= ToOp->isDivergent();
+     } while (UI != UE && *UI == User);
+ 
+     if (To_IsDivergent != From->isDivergent())
+       updateDivergence(User);
+ 
+     // Now that we have modified User, add it back to the CSE maps.  If it
+     // already exists there, recursively merge the results together.
+     AddModifiedNodeToCSEMaps(User);
+   }
+ 
+   // If we just RAUW'd the root, re-root at the matching replacement value.
+   if (From == getRoot().getNode())
+     setRoot(SDValue(To[getRoot().getResNo()]));
+ }
+ 
+ /// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+ /// uses of other values produced by From.getNode() alone.  If From is the
+ /// DAG root, the root is updated to To as well.
+ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
+   // Handle the really simple, really trivial case efficiently.
+   if (From == To) return;
+ 
+   // Single-result node: the whole-node replacement does the same job.
+   if (From.getNode()->getNumValues() == 1) {
+     ReplaceAllUsesWith(From, To);
+     return;
+   }
+ 
+   // Preserve Debug Info.
+   transferDbgValues(From, To);
+ 
+   // Iterate over just the existing users of From. See the comments in
+   // the ReplaceAllUsesWith above.
+   SDNode::use_iterator UI = From.getNode()->use_begin(),
+                        UE = From.getNode()->use_end();
+   RAUWUpdateListener Listener(*this, UI, UE);
+   while (UI != UE) {
+     SDNode *User = *UI;
+     bool UserRemovedFromCSEMaps = false;
+ 
+     // A user can appear in a use list multiple times, and when this
+     // happens the uses are usually next to each other in the list.
+     // To help reduce the number of CSE recomputations, process all
+     // the uses of this user that we can find this way.
+     do {
+       SDUse &Use = UI.getUse();
+ 
+       // Skip uses of different values from the same node.
+       if (Use.getResNo() != From.getResNo()) {
+         ++UI;
+         continue;
+       }
+ 
+       // If this node hasn't been modified yet, it's still in the CSE maps,
+       // so remove its old self from the CSE maps.
+       if (!UserRemovedFromCSEMaps) {
+         RemoveNodeFromCSEMaps(User);
+         UserRemovedFromCSEMaps = true;
+       }
+ 
+       ++UI; // Advance before Use.set() retargets this use.
+       Use.set(To);
+       if (To->isDivergent() != From->isDivergent())
+         updateDivergence(User);
+     } while (UI != UE && *UI == User);
+     // We are iterating over all uses of the From node, so if a use
+     // doesn't use the specific value, no changes are made.
+     if (!UserRemovedFromCSEMaps)
+       continue;
+ 
+     // Now that we have modified User, add it back to the CSE maps.  If it
+     // already exists there, recursively merge the results together.
+     AddModifiedNodeToCSEMaps(User);
+   }
+ 
+   // If we just RAUW'd the root, take note.
+   if (From == getRoot())
+     setRoot(To);
+ }
+ 
+ namespace {
+ 
+   /// UseMemo - One recorded use, for SelectionDAG::ReplaceAllUsesOfValuesWith.
+   struct UseMemo {
+     SDNode *User;
+     unsigned Index;
+     SDUse *Use;
+   };
+ 
+   /// operator< - Orders Memos by the address of their User.
+   bool operator<(const UseMemo &LHS, const UseMemo &RHS) {
+     return reinterpret_cast<intptr_t>(LHS.User) <
+            reinterpret_cast<intptr_t>(RHS.User);
+   }
+ 
+ } // end anonymous namespace
+ 
+ void SelectionDAG::updateDivergence(SDNode *N) {
+   // Nodes the target reports as always uniform are left untouched.
+   if (TLI->isSDNodeAlwaysUniform(N))
+     return;
+   // Source of divergence, or any operand not of type MVT::Other is divergent.
+   bool NowDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA);
+   for (auto &Operand : N->ops())
+     if (Operand.Val.getValueType() != MVT::Other)
+       NowDivergent |= Operand.getNode()->isDivergent();
+   if (N->SDNodeBits.IsDivergent == NowDivergent)
+     return;
+   // The bit changed: record it and propagate to every user.
+   N->SDNodeBits.IsDivergent = NowDivergent;
+   for (SDNode *User : N->uses())
+     updateDivergence(User);
+ }
+ 
+ 
+ /// Appends the DAG's nodes to Order so that each follows all of its operands.
+ void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode*>& Order) {
+   DenseMap<SDNode *, unsigned> Degree;
+   Order.reserve(AllNodes.size());
+   for (auto &N : allnodes()) {
+     unsigned NOps = N.getNumOperands();
+     Degree[&N] = NOps;
+     if (NOps == 0)
+       Order.push_back(&N);
+   }
+   // Walk by index, not iterator: Order grows while it is being traversed, and
+   // an index stays valid even if push_back has to reallocate.
+   for (size_t Idx = 0; Idx != Order.size(); ++Idx) {
+     for (SDNode *User : Order[Idx]->uses()) {
+       if (--Degree[User] == 0)
+         Order.push_back(User);
+     }
+   }
+ }
+ 
+ #ifndef NDEBUG
+ /// Recomputes divergence for every node in topological order and asserts that
+ /// it matches the divergence bit stored on each node.
+ void SelectionDAG::VerifyDAGDiverence() {
+   std::vector<SDNode *> Sorted;
+   CreateTopologicalOrder(Sorted);
+   const TargetLowering &TLI = getTargetLoweringInfo();
+   // Every node starts out as not divergent.
+   DenseMap<const SDNode *, bool> Expected;
+   for (auto &Node : allnodes())
+     Expected[&Node] = false;
+   for (SDNode *Node : Sorted) {
+     bool AlreadySet = Expected[Node];
+     bool Computed = TLI.isSDNodeSourceOfDivergence(Node, FLI, DA);
+     // Operands of type MVT::Other do not contribute.
+     for (auto &Operand : Node->ops())
+       if (Operand.Val.getValueType() != MVT::Other)
+         Computed |= Expected[Operand.getNode()];
+     if (!AlreadySet && Computed && !TLI.isSDNodeAlwaysUniform(Node))
+       Expected[Node] = true;
+   }
+   for (auto &Node : allnodes()) {
+     (void)Node;
+     assert(Expected[&Node] == Node.isDivergent() &&
+            "Divergence bit inconsistency detected\n");
+   }
+ }
+ #endif
+ 
+ 
+ /// ReplaceAllUsesOfValuesWith - For each of the \p Num pairs, replace any
+ /// uses of From[i] with To[i], leaving uses of other values produced by
+ /// From[i].getNode() alone.  The same value may appear in both the From and
+ /// To list.
+ ///
+ /// \param From Array of \p Num values whose uses are rewritten.
+ /// \param To   Array of \p Num replacement values; To[i] replaces From[i].
+ /// \param Num  Number of entries in \p From and \p To.
+ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+                                               const SDValue *To,
+                                               unsigned Num){
+   // Handle the simple, trivial case efficiently.
+   if (Num == 1)
+     return ReplaceAllUsesOfValueWith(*From, *To);
+ 
+   // NOTE(review): this runs for every Num != 1 and only passes the first
+   // From/To pair -- confirm that Num == 0 callers and the remaining pairs are
+   // handled as intended.
+   transferDbgValues(*From, *To);
+ 
+   // Read up all the uses and make records of them. This helps
+   // processing new uses that are introduced during the
+   // replacement process.
+   SmallVector<UseMemo, 4> Uses;
+   for (unsigned i = 0; i != Num; ++i) {
+     unsigned FromResNo = From[i].getResNo();
+     SDNode *FromNode = From[i].getNode();
+     for (SDNode::use_iterator UI = FromNode->use_begin(),
+          E = FromNode->use_end(); UI != E; ++UI) {
+       SDUse &Use = UI.getUse();
+       // Only record uses of the specific result being replaced.
+       if (Use.getResNo() == FromResNo) {
+         UseMemo Memo = { *UI, i, &Use };
+         Uses.push_back(Memo);
+       }
+     }
+   }
+ 
+   // Sort the uses, so that all the uses from a given User are together.
+   llvm::sort(Uses);
+ 
+   // The inner do/while advances UseIndex, so the for header has no increment.
+   for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+        UseIndex != UseIndexEnd; ) {
+     // We know that this user uses some value of From.  If it is the right
+     // value, update it.
+     SDNode *User = Uses[UseIndex].User;
+ 
+     // This node is about to morph, remove its old self from the CSE maps.
+     RemoveNodeFromCSEMaps(User);
+ 
+     // The Uses array is sorted, so all the uses for a given User
+     // are next to each other in the list.
+     // To help reduce the number of CSE recomputations, process all
+     // the uses of this user that we can find this way.
+     do {
+       unsigned i = Uses[UseIndex].Index;
+       SDUse &Use = *Uses[UseIndex].Use;
+       ++UseIndex;
+ 
+       Use.set(To[i]);
+     } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+ 
+     // Now that we have modified User, add it back to the CSE maps.  If it
+     // already exists there, recursively merge the results together.
+     AddModifiedNodeToCSEMaps(User);
+   }
+ }
+ 
+ /// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+ /// based on their topological order, and reorder the AllNodes list to match
+ /// that order. Returns the number of nodes in the DAG, which is one more than
+ /// the largest id assigned.
+ unsigned SelectionDAG::AssignTopologicalOrder() {
+   unsigned DAGSize = 0;
+ 
+   // SortedPos tracks the progress of the algorithm. Nodes before it are
+   // sorted, nodes after it are unsorted. When the algorithm completes
+   // it is at the end of the list.
+   allnodes_iterator SortedPos = allnodes_begin();
+ 
+   // Visit all the nodes. Move nodes with no operands to the front of
+   // the list immediately. Annotate nodes that do have operands with their
+   // operand count. Before we do this, the Node Id fields of the nodes
+   // may contain arbitrary values. After, the Node Id fields for nodes
+   // before SortedPos will contain the topological sort index, and the
+   // Node Id fields for nodes At SortedPos and after will contain the
+   // count of outstanding operands.
+   for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
+     // Advance I before N is possibly moved within the list.
+     SDNode *N = &*I++;
+     checkForCycles(N, this);
+     unsigned Degree = N->getNumOperands();
+     if (Degree == 0) {
+       // A node with no operands, add it to the result array immediately.
+       N->setNodeId(DAGSize++);
+       allnodes_iterator Q(N);
+       if (Q != SortedPos)
+         SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+       assert(SortedPos != AllNodes.end() && "Overran node list");
+       ++SortedPos;
+     } else {
+       // Temporarily use the Node Id as scratch space for the degree count.
+       N->setNodeId(Degree);
+     }
+   }
+ 
+   // Visit all the nodes. As we iterate, move nodes into sorted order,
+   // such that by the time the end is reached all nodes will be sorted.
+   for (SDNode &Node : allnodes()) {
+     SDNode *N = &Node;
+     checkForCycles(N, this);
+     // N is in sorted position, so all its uses have one less operand
+     // that needs to be sorted.
+     for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+          UI != UE; ++UI) {
+       SDNode *P = *UI;
+       unsigned Degree = P->getNodeId();
+       assert(Degree != 0 && "Invalid node degree");
+       --Degree;
+       if (Degree == 0) {
+         // All of P's operands are sorted, so P may be sorted now.
+         P->setNodeId(DAGSize++);
+         if (P->getIterator() != SortedPos)
+           SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+         assert(SortedPos != AllNodes.end() && "Overran node list");
+         ++SortedPos;
+       } else {
+         // Update P's outstanding operand count.
+         P->setNodeId(Degree);
+       }
+     }
+     // The walk has reached the sorted/unsorted boundary: the next node was
+     // never moved into the sorted region. Dump diagnostics in debug builds
+     // and abort.
+     if (Node.getIterator() == SortedPos) {
+ #ifndef NDEBUG
+       allnodes_iterator I(N);
+       SDNode *S = &*++I;
+       dbgs() << "Overran sorted position:\n";
+       S->dumprFull(this); dbgs() << "\n";
+       dbgs() << "Checking if this is due to cycles\n";
+       checkForCycles(this, true);
+ #endif
+       llvm_unreachable(nullptr);
+     }
+   }
+ 
+   // Sanity-check the resulting order.
+   assert(SortedPos == AllNodes.end() &&
+          "Topological sort incomplete!");
+   assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+          "First node in topological sort is not the entry token!");
+   assert(AllNodes.front().getNodeId() == 0 &&
+          "First node in topological sort has non-zero id!");
+   assert(AllNodes.front().getNumOperands() == 0 &&
+          "First node in topological sort has operands!");
+   assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+          "Last node in topologic sort has unexpected id!");
+   assert(AllNodes.back().use_empty() &&
+          "Last node in topologic sort has users!");
+   assert(DAGSize == allnodes_size() && "Node count mismatch!");
+   return DAGSize;
+ }
+ 
+ /// Register the debug value \p DB with the DAG's debug info. A non-null
+ /// \p SD is the node that produces the value; it is flagged as having a
+ /// debug value attached. \p isParameter is forwarded unchanged.
+ void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
+   if (SD != nullptr) {
+     // Either nothing is recorded for SD yet, or its flag is already set.
+     assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
+     SD->setHasDebugValue(true);
+   }
+ 
+   DbgInfo->add(DB, SD, isParameter);
+ }
+ 
+ /// Register the debug label \p DB with the DAG's debug info.
+ void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) { DbgInfo->add(DB); }
+ 
+ /// Give the memory operation \p NewMemOp the same position as \p OldLoad in
+ /// terms of chain dependencies.
+ ///
+ /// \param OldLoad  Load whose output chain (result 1) users must also depend
+ ///                 on the new operation.
+ /// \param NewMemOp Value produced by a MemSDNode; result 1 of its node is
+ ///                 used as the new chain.
+ /// \returns the new operation's chain if the old chain has no users,
+ /// otherwise the TokenFactor that joins both chains.
+ SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+                                                    SDValue NewMemOp) {
+   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+   // The new memory operation must have the same position as the old load in
+   // terms of memory dependency. Create a TokenFactor for the old load and new
+   // memory operation and update uses of the old load's output chain to use that
+   // TokenFactor.
+   SDValue OldChain = SDValue(OldLoad, 1);
+   SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
+   // Nobody depends on the old chain, so there is nothing to rewrite.
+   if (!OldLoad->hasAnyUseOfValue(1))
+     return NewChain;
+ 
+   SDValue TokenFactor =
+       getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
+   ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
+   // The TokenFactor itself uses OldChain, so the replacement above presumably
+   // rewrote that operand as well; put the intended operands back.
+   UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+   return TokenFactor;
+ }
+ 
++/// Return a GlobalAddress node for the function of the current module whose
++/// name matches the ExternalSymbol \p Op.
++///
++/// \param Op Must be an ExternalSymbolSDNode (asserted).
++/// \returns a GlobalAddress node for the function, located at \p Op's SDLoc.
++/// Reports a fatal error when the module has no function with that name.
++SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op) {
++  assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");
++
++  auto *Symbol = cast<ExternalSymbolSDNode>(Op)->getSymbol();
++  auto *Module = MF->getFunction().getParent();
++  auto *Function = Module->getFunction(Symbol);
++
++  if (Function != nullptr) {
++    // Ask for the pointer type of the address space the function lives in;
++    // the default (address space 0) pointer type may differ from it.
++    auto PtrTy = TLI->getPointerTy(getDataLayout(),
++                                   Function->getType()->getAddressSpace());
++    return getGlobalAddress(Function, SDLoc(Op), PtrTy);
++  }
++
++  // No such function: build the diagnostic and abort.
++  std::string ErrorStr;
++  raw_string_ostream ErrorFormatter(ErrorStr);
++
++  ErrorFormatter << "Undefined external symbol ";
++  ErrorFormatter << '"' << Symbol << '"';
++  ErrorFormatter.flush();
++
++  report_fatal_error(ErrorStr);
++}
++
+ //===----------------------------------------------------------------------===//
+ //                              SDNode Class
+ //===----------------------------------------------------------------------===//
+ 
+ /// Returns true if \p V is a ConstantSDNode for which isNullValue() holds.
+ bool llvm::isNullConstant(SDValue V) {
+   if (auto *C = dyn_cast<ConstantSDNode>(V))
+     return C->isNullValue();
+   return false;
+ }
+ 
+ /// Returns true if \p V is a ConstantFPSDNode that is zero and not negative.
+ bool llvm::isNullFPConstant(SDValue V) {
+   if (auto *CFP = dyn_cast<ConstantFPSDNode>(V))
+     return CFP->isZero() && !CFP->isNegative();
+   return false;
+ }
+ 
+ /// Returns true if \p V is a ConstantSDNode for which isAllOnesValue() holds.
+ bool llvm::isAllOnesConstant(SDValue V) {
+   if (auto *C = dyn_cast<ConstantSDNode>(V))
+     return C->isAllOnesValue();
+   return false;
+ }
+ 
+ /// Returns true if \p V is a ConstantSDNode for which isOne() holds.
+ bool llvm::isOneConstant(SDValue V) {
+   if (auto *C = dyn_cast<ConstantSDNode>(V))
+     return C->isOne();
+   return false;
+ }
+ 
+ /// Walk down operand 0 of any chain of ISD::BITCAST nodes starting at \p V
+ /// and return the first value that is not a bitcast.
+ SDValue llvm::peekThroughBitcasts(SDValue V) {
+   while (true) {
+     if (V.getOpcode() != ISD::BITCAST)
+       return V;
+     V = V.getOperand(0);
+   }
+ }
+ 
+ /// Like peeking through bitcasts, but only steps from an ISD::BITCAST to its
+ /// operand while that operand has exactly one use.
+ SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
+   while (true) {
+     if (V.getOpcode() != ISD::BITCAST || !V.getOperand(0).hasOneUse())
+       return V;
+     V = V.getOperand(0);
+   }
+ }
+ 
+ /// Returns true if \p V is an ISD::XOR whose second operand, looked at
+ /// through bitcasts, is an all-ones constant or constant splat.
+ bool llvm::isBitwiseNot(SDValue V) {
+   if (V.getOpcode() == ISD::XOR) {
+     SDValue RHS = peekThroughBitcasts(V.getOperand(1));
+     if (ConstantSDNode *Mask = isConstOrConstSplat(RHS))
+       return Mask->isAllOnesValue();
+   }
+   return false;
+ }
+ 
+ /// Return the ConstantSDNode behind \p N when \p N is either a constant or a
+ /// build vector that splats one, otherwise nullptr. A splat with undef
+ /// elements is only accepted when \p AllowUndefs is set, and the splatted
+ /// constant must have the vector's scalar type.
+ ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, bool AllowUndefs) {
+   if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
+     return Scalar;
+ 
+   auto *BV = dyn_cast<BuildVectorSDNode>(N);
+   if (!BV)
+     return nullptr;
+ 
+   BitVector UndefElements;
+   ConstantSDNode *Splat = BV->getConstantSplatNode(&UndefElements);
+   if (!Splat)
+     return nullptr;
+ 
+   // Undef elements are tolerated only if the caller opted in.
+   if (!AllowUndefs && !UndefElements.none())
+     return nullptr;
+ 
+   // BuildVectors can truncate their operands. Ignore that case here.
+   if (Splat->getValueType(0) != N.getValueType().getScalarType())
+     return nullptr;
+ 
+   return Splat;
+ }
+ 
+ /// Return the ConstantFPSDNode behind \p N when \p N is either an FP
+ /// constant or a build vector that splats one, otherwise nullptr. A splat
+ /// with undef elements is only accepted when \p AllowUndefs is set.
+ ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) {
+   if (auto *Scalar = dyn_cast<ConstantFPSDNode>(N))
+     return Scalar;
+ 
+   auto *BV = dyn_cast<BuildVectorSDNode>(N);
+   if (!BV)
+     return nullptr;
+ 
+   BitVector UndefElements;
+   ConstantFPSDNode *Splat = BV->getConstantFPSplatNode(&UndefElements);
+   if (!Splat)
+     return nullptr;
+ 
+   // Undef elements are tolerated only if the caller opted in.
+   if (!AllowUndefs && !UndefElements.none())
+     return nullptr;
+ 
+   return Splat;
+ }
+ 
+ /// Returns true if \p N is a constant, or a constant splat, whose value is
+ /// null.
+ bool llvm::isNullOrNullSplat(SDValue N) {
+   // TODO: may want to use peekThroughBitcast() here.
+   if (ConstantSDNode *C = isConstOrConstSplat(N))
+     return C->isNullValue();
+   return false;
+ }
+ 
+ /// Returns true if \p N is a constant, or a constant splat, equal to one
+ /// whose width matches the scalar size of \p N.
+ bool llvm::isOneOrOneSplat(SDValue N) {
+   // TODO: may want to use peekThroughBitcast() here.
+   const unsigned ScalarBits = N.getScalarValueSizeInBits();
+   if (ConstantSDNode *C = isConstOrConstSplat(N))
+     return C->isOne() && C->getValueSizeInBits(0) == ScalarBits;
+   return false;
+ }
+ 
+ /// Returns true if \p N, looked at through bitcasts, is a constant or
+ /// constant splat of all ones whose width matches the scalar size of the
+ /// peeked value.
+ bool llvm::isAllOnesOrAllOnesSplat(SDValue N) {
+   N = peekThroughBitcasts(N);
+   const unsigned ScalarBits = N.getScalarValueSizeInBits();
+   if (ConstantSDNode *C = isConstOrConstSplat(N))
+     return C->isAllOnesValue() && C->getValueSizeInBits(0) == ScalarBits;
+   return false;
+ }
+ 
+ /// Drops the handle's operands on destruction.
+ HandleSDNode::~HandleSDNode() { DropOperands(); }
+ 
+ /// Construct a global-address node.
+ ///
+ /// \param Opc   Opcode forwarded to the SDNode base.
+ /// \param Order Ordering value forwarded to the SDNode base.
+ /// \param DL    Debug location forwarded to the SDNode base.
+ /// \param GA    The global value this node refers to.
+ /// \param VT    Result type of the node.
+ /// \param o     Stored as the node's Offset.
+ /// \param TF    Stored as the node's TargetFlags.
+ GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
+                                          const DebugLoc &DL,
+                                          const GlobalValue *GA, EVT VT,
+                                          int64_t o, unsigned char TF)
+     : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+   TheGlobal = GA;
+ }
+ 
+ /// Construct an ISD::ADDRSPACECAST node of type \p VT that records the
+ /// source address space \p SrcAS and the destination address space
+ /// \p DestAS. \p Order and \p dl are forwarded to the SDNode base.
+ AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
+                                          EVT VT, unsigned SrcAS,
+                                          unsigned DestAS)
+     : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
+       SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+ 
+ /// Construct a memory-accessing node.
+ ///
+ /// \param Opc   Opcode forwarded to the SDNode base.
+ /// \param Order Ordering value forwarded to the SDNode base.
+ /// \param dl    Debug location forwarded to the SDNode base.
+ /// \param VTs   Result types of the node.
+ /// \param memvt Type of the value in memory; its store size must not exceed
+ ///              the size recorded in \p mmo (asserted).
+ /// \param mmo   Memory operand describing the access; must be non-null, it is
+ ///              dereferenced here.
+ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
+                      SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
+     : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+   // Cache the memory operand's flags in the node's bitfield.
+   MemSDNodeBits.IsVolatile = MMO->isVolatile();
+   MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
+   MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
+   MemSDNodeBits.IsInvariant = MMO->isInvariant();
+ 
+   // We check here that the size of the memory operand fits within the size of
+   // the MMO. This is because the MMO might indicate only a possible address
+   // range instead of specifying the affected memory addresses precisely.
+   assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
+ }
+ 
+ /// Profile - Add the data identifying this node to \p ID by delegating to
+ /// AddNodeIDNode.
+ void SDNode::Profile(FoldingSetNodeID &ID) const { AddNodeIDNode(ID, this); }
+ 
+ namespace {
+ 
+   /// Table holding one EVT per simple value type; entry k is built from
+   /// MVT::SimpleValueType value k, for k below MVT::LAST_VALUETYPE.
+   struct EVTArray {
+     std::vector<EVT> VTs;
+ 
+     EVTArray() {
+       VTs.reserve(MVT::LAST_VALUETYPE);
+       for (unsigned Ty = 0; Ty != MVT::LAST_VALUETYPE; ++Ty) {
+         const MVT Simple((MVT::SimpleValueType)Ty);
+         VTs.push_back(Simple);
+       }
+     }
+   };
+ 
+ } // end anonymous namespace
+ 
+ // Lazily-constructed, file-local state used when handing out EVT pointers:
+ // a set of EVTs ordered by EVT::compareRawBits, ...
+ static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
+ // ... a table with one entry per simple value type, ...
+ static ManagedStatic<EVTArray> SimpleVTArray;
+ // ... and a mutex (presumably guarding EVTs -- confirm against its users).
+ static ManagedStatic<sys::SmartMutex<true>> VTMutex;
+ 
+ /// getValueTypeList - Return a pointer to a stored EVT equal to \p VT.
+ /// Simple types are looked up in the per-simple-type table; extended types
+ /// are inserted into the shared set while holding VTMutex.
+ const EVT *SDNode::getValueTypeList(EVT VT) {
+   if (!VT.isExtended()) {
+     assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
+            "Value type out of range!");
+     return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
+   }
+ 
+   sys::SmartScopedLock<true> Lock(*VTMutex);
+   return &(*EVTs->insert(VT).first);
+ }
+ 
+ /// hasNUsesOfValue - Return true if result number \p Value of this node has
+ /// exactly \p NUses uses. Uses of the node's other results are not counted.
+ bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+   assert(Value < getNumValues() && "Bad value!");
+ 
+   // TODO: Only iterate over uses of a given value of the node
+   for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+     if (UI.getUse().getResNo() != Value)
+       continue;
+     // One matching use more than the caller asked for.
+     if (NUses == 0)
+       return false;
+     --NUses;
+   }
+ 
+   // All requested uses must have been accounted for.
+   return NUses == 0;
+ }
+ 
+ /// hasAnyUseOfValue - Return true if result number \p Value of this node
+ /// has at least one use. Uses of the node's other results are ignored.
+ bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+   assert(Value < getNumValues() && "Bad value!");
+ 
+   SDNode::use_iterator UI = use_begin(), E = use_end();
+   while (UI != E) {
+     if (UI.getUse().getResNo() == Value)
+       return true;
+     ++UI;
+   }
+ 
+   return false;
+ }
+ 
+ /// isOnlyUserOf - Return true if \p N has at least one use and every use of
+ /// \p N is by this node.
+ bool SDNode::isOnlyUserOf(const SDNode *N) const {
+   bool FoundSelf = false;
+   for (SDNode *User : N->uses()) {
+     if (User != this)
+       return false;
+     FoundSelf = true;
+   }
+ 
+   return FoundSelf;
+ }
+ 
+ /// Return true if \p N has at least one use and every user of \p N is
+ /// contained in \p Nodes.
+ bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+   bool FoundUser = false;
+   for (SDNode *User : N->uses()) {
+     if (!llvm::is_contained(Nodes, User))
+       return false;
+     FoundUser = true;
+   }
+ 
+   return FoundUser;
+ }
+ 
+ /// isOperandOf - Return true if this value is one of the operand values of
+ /// \p N.
+ bool SDValue::isOperandOf(const SDNode *N) const {
+   return llvm::any_of(N->op_values(),
+                       [this](const SDValue &Op) { return *this == Op; });
+ }
+ 
+ /// Return true if any operand value of \p N is produced by this node.
+ bool SDNode::isOperandOf(const SDNode *N) const {
+   return llvm::any_of(N->op_values(), [this](const SDValue &Op) {
+     return this == Op.getNode();
+   });
+ }
+ 
+ /// reachesChainWithoutSideEffects - Return true if this operand (which must
+ /// be a chain) reaches the specified operand without crossing any
+ /// side-effecting instructions on any chain path.  In practice, this looks
+ /// through token factors and non-volatile loads.  In order to remain efficient,
+ /// this only looks a couple of nodes in, it does not do an exhaustive search.
+ ///
+ /// Note that we only need to examine chains when we're searching for
+ /// side-effects; SelectionDAG requires that all side-effects are represented
+ /// by chains, even if another operand would force a specific ordering. This
+ /// constraint is necessary to allow transformations like splitting loads.
+ ///
+ /// \param Dest  The chain value that must be reached.
+ /// \param Depth Remaining search budget; each recursive step passes
+ ///              Depth - 1 and the search gives up when it reaches 0.
+ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+                                              unsigned Depth) const {
+   // Trivially reached; checked before the depth limit.
+   if (*this == Dest) return true;
+ 
+   // Don't search too deeply, we just want to be able to see through
+   // TokenFactor's etc.
+   if (Depth == 0) return false;
+ 
+   // If this is a token factor, all inputs to the TF happen in parallel.
+   if (getOpcode() == ISD::TokenFactor) {
+     // First, try a shallow search.
+     if (is_contained((*this)->ops(), Dest)) {
+       // We found the chain we want as an operand of this TokenFactor.
+       // Essentially, we reach the chain without side-effects if we could
+       // serialize the TokenFactor into a simple chain of operations with
+       // Dest as the last operation. This is automatically true if the
+       // chain has one use: there are no other ordering constraints.
+       // If the chain has more than one use, we give up: some other
+       // use of Dest might force a side-effect between Dest and the current
+       // node.
+       if (Dest.hasOneUse())
+         return true;
+     }
+     // Next, try a deep search: check whether every operand of the TokenFactor
+     // reaches Dest.
+     return llvm::all_of((*this)->ops(), [=](SDValue Op) {
+       return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
+     });
+   }
+ 
+   // Loads don't have side effects, look through them.
+   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+     if (!Ld->isVolatile())
+       return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+   }
+   // Any other node, including a volatile load, stops the search.
+   return false;
+ }
+ 
+ /// Convenience wrapper around hasPredecessorHelper: runs it for \p N with a
+ /// fresh visited set and a worklist seeded with this node, and returns its
+ /// result.
+ bool SDNode::hasPredecessor(const SDNode *N) const {
+   SmallVector<const SDNode *, 16> Pending(1, this);
+   SmallPtrSet<const SDNode *, 32> Seen;
+   return hasPredecessorHelper(N, Seen, Pending);
+ }
+ 
+ /// Intersect this node's flags with \p Flags, storing the result in the
+ /// node.
+ void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
+   // The parameter shadows the member, hence the explicit this->.
+   SDNodeFlags &OwnFlags = this->Flags;
+   OwnFlags.intersectWith(Flags);
+ }
+ 
+ /// Match a shuffle+binop reduction pyramid that ends in an extract of
+ /// element 0.
+ ///
+ /// \param Extract         Candidate ISD::EXTRACT_VECTOR_ELT node with a null
+ ///                        constant index.
+ /// \param BinOp           Set to the matched opcode on success; left
+ ///                        untouched on failure.
+ /// \param CandidateBinOps Opcodes that are acceptable for the reduction.
+ /// \returns the vector that feeds the innermost stage on success, or an empty
+ /// SDValue if the pattern does not match.
+ SDValue
+ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
+                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
+   // The pattern must end in an extract from index 0.
+   if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+       !isNullConstant(Extract->getOperand(1)))
+     return SDValue();
+ 
+   SDValue Op = Extract->getOperand(0);
+   // One stage per halving of the vector.
+   unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+ 
+   // Match against one of the candidate binary ops.
+   if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
+         return Op.getOpcode() == unsigned(BinOp);
+       }))
+     return SDValue();
+ 
+   // At each stage, we're looking for something that looks like:
+   // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+   //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+   //                               i32 undef, i32 undef, i32 undef, i32 undef>
+   // %a = binop <8 x i32> %op, %s
+   // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+   // we expect something like:
+   // <4,5,6,7,u,u,u,u>
+   // <2,3,u,u,u,u,u,u>
+   // <1,u,u,u,u,u,u,u>
+   unsigned CandidateBinOp = Op.getOpcode();
+   for (unsigned i = 0; i < Stages; ++i) {
+     // Every stage must use the same opcode as the outermost one.
+     if (Op.getOpcode() != CandidateBinOp)
+       return SDValue();
+ 
+     SDValue Op0 = Op.getOperand(0);
+     SDValue Op1 = Op.getOperand(1);
+ 
+     // The shuffle may be either operand; continue with the other one.
+     ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Op0);
+     if (Shuffle) {
+       Op = Op1;
+     } else {
+       Shuffle = dyn_cast<ShuffleVectorSDNode>(Op1);
+       Op = Op0;
+     }
+ 
+     // The first operand of the shuffle should be the same as the other operand
+     // of the binop.
+     if (!Shuffle || Shuffle->getOperand(0) != Op)
+       return SDValue();
+ 
+     // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+     // Only the first 2^i mask elements are checked.
+     for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
+       if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
+         return SDValue();
+   }
+ 
+   BinOp = (ISD::NodeType)CandidateBinOp;
+   return Op;
+ }
+ 
+ /// Scalarize the single-result vector operation \p N and return the scalar
+ /// results packed into a build vector.
+ ///
+ /// \param N     Vector operation with exactly one result (asserted).
+ /// \param ResNE Number of elements of the returned vector. 0 means "as many
+ ///              as \p N's result type has". If it is smaller than that, only
+ ///              the first \p ResNE elements are computed; if it is larger, the
+ ///              extra elements are UNDEF.
+ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
+   assert(N->getNumValues() == 1 &&
+          "Can't unroll a vector with multiple results!");
+ 
+   EVT VT = N->getValueType(0);
+   unsigned NE = VT.getVectorNumElements();
+   EVT EltVT = VT.getVectorElementType();
+   SDLoc dl(N);
+ 
+   SmallVector<SDValue, 8> Scalars;
+   SmallVector<SDValue, 4> Operands(N->getNumOperands());
+ 
+   // If ResNE is 0, fully unroll the vector op.
+   if (ResNE == 0)
+     ResNE = NE;
+   else if (NE > ResNE)
+     NE = ResNE;
+ 
+   // i is declared outside the loop because the UNDEF padding below continues
+   // from where this loop stops.
+   unsigned i;
+   for (i= 0; i != NE; ++i) {
+     // Gather the scalar operands for element i.
+     for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+       SDValue Operand = N->getOperand(j);
+       EVT OperandVT = Operand.getValueType();
+       if (OperandVT.isVector()) {
+         // A vector operand; extract a single element.
+         EVT OperandEltVT = OperandVT.getVectorElementType();
+         Operands[j] =
+             getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
+                     getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout())));
+       } else {
+         // A scalar operand; just use it as is.
+         Operands[j] = Operand;
+       }
+     }
+ 
+     // Emit the scalar form of the operation for element i.
+     switch (N->getOpcode()) {
+     default: {
+       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands,
+                                 N->getFlags()));
+       break;
+     }
+     case ISD::VSELECT:
+       // The scalar counterpart of VSELECT is SELECT.
+       Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands));
+       break;
+     case ISD::SHL:
+     case ISD::SRA:
+     case ISD::SRL:
+     case ISD::ROTL:
+     case ISD::ROTR:
+       // The amount operand goes through getShiftAmountOperand for the scalar
+       // operation.
+       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
+                                getShiftAmountOperand(Operands[0].getValueType(),
+                                                      Operands[1])));
+       break;
+     case ISD::SIGN_EXTEND_INREG:
+     case ISD::FP_ROUND_INREG: {
+       // The type operand is a vector type; use its element type instead.
+       EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
+       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+                                 Operands[0],
+                                 getValueType(ExtVT)));
+     }
+     }
+   }
+ 
+   // Pad with UNDEF up to the requested number of elements.
+   for (; i < ResNE; ++i)
+     Scalars.push_back(getUNDEF(EltVT));
+ 
+   EVT VecVT = EVT::getVectorVT(*getContext(), EltVT, ResNE);
+   return getBuildVector(VecVT, dl, Scalars);
+ }
+ 
+ /// Return true if \p LD and \p Base are non-volatile, unindexed loads on the
+ /// same chain, \p LD loads \p Bytes bytes, and the offset that
+ /// BaseIndexOffset::equalBaseIndex reports between the two addresses equals
+ /// \p Dist * \p Bytes.
+ ///
+ /// \param LD    The load being tested.
+ /// \param Base  The load whose address is the reference point.
+ /// \param Bytes Expected size in bytes of the value loaded by \p LD.
+ /// \param Dist  Signed distance from \p Base, in units of \p Bytes.
+ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
+                                                   LoadSDNode *Base,
+                                                   unsigned Bytes,
+                                                   int Dist) const {
+   if (LD->isVolatile() || Base->isVolatile())
+     return false;
+   if (LD->isIndexed() || Base->isIndexed())
+     return false;
+   if (LD->getChain() != Base->getChain())
+     return false;
+   EVT VT = LD->getValueType(0);
+   if (VT.getSizeInBits() / 8 != Bytes)
+     return false;
+ 
+   auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
+   auto LocDecomp = BaseIndexOffset::match(LD, *this);
+ 
+   int64_t Offset = 0;
+   if (!BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
+     return false;
+ 
+   // Widen before multiplying: `Dist * Bytes` would be evaluated as unsigned,
+   // so a negative Dist could never compare equal to a negative Offset.
+   return Dist * static_cast<int64_t>(Bytes) == Offset;
+ }
+ 
+ /// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+ /// it cannot be inferred.
+ ///
+ /// Two address shapes are recognised: a global address plus constant offset
+ /// (alignment taken from the known trailing zero bits of the global), and a
+ /// frame index with an optional constant offset (alignment taken from the
+ /// frame object).
+ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+   // If this is a GlobalAddress + cst, return the alignment.
+   const GlobalValue *GV;
+   int64_t GVOffset = 0;
+   if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+     unsigned IdxWidth = getDataLayout().getIndexTypeSizeInBits(GV->getType());
+     KnownBits Known(IdxWidth);
+     llvm::computeKnownBits(GV, Known, getDataLayout());
+     unsigned AlignBits = Known.countMinTrailingZeros();
+     // Shift an unsigned one: with AlignBits >= 31 a signed `1 << 31` would
+     // overflow int.
+     unsigned Align = AlignBits ? 1U << std::min(31U, AlignBits) : 0;
+     if (Align)
+       return MinAlign(Align, GVOffset);
+   }
+ 
+   // If this is a direct reference to a stack slot, use information about the
+   // stack slot's alignment. An explicit flag records whether a frame index
+   // was found, instead of a `1 << 31` sentinel that overflows int.
+   bool HasFrameIdx = false;
+   int FrameIdx = 0;
+   int64_t FrameOffset = 0;
+   if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+     FrameIdx = FI->getIndex();
+     HasFrameIdx = true;
+   } else if (isBaseWithConstantOffset(Ptr) &&
+              isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+     // Handle FI+Cst
+     FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+     FrameOffset = Ptr.getConstantOperandVal(1);
+     HasFrameIdx = true;
+   }
+ 
+   if (HasFrameIdx) {
+     const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
+     unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+                                     FrameOffset);
+     return FIInfoAlign;
+   }
+ 
+   return 0;
+ }
+ 
+ /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+ /// which is split (or expanded) into two pieces. Both parts currently get
+ /// the same type: the half-element-count vector for vector types, and the
+ /// type the target transforms \p VT to otherwise.
+ std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+   // Currently all types are split in half.
+   const EVT PartVT = VT.isVector()
+                          ? VT.getHalfNumVectorElementsVT(*getContext())
+                          : TLI->getTypeToTransformTo(*getContext(), VT);
+   return std::make_pair(PartVT, PartVT);
+ }
+ 
+ /// SplitVector - Split the vector \p N with EXTRACT_SUBVECTOR and return the
+ /// low/high part. The low part has type \p LoVT and starts at element 0; the
+ /// high part has type \p HiVT and starts right after the low part.
+ std::pair<SDValue, SDValue>
+ SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+                           const EVT &HiVT) {
+   assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
+          N.getValueType().getVectorNumElements() &&
+          "More vector elements requested than available!");
+ 
+   SDValue LoStart = getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout()));
+   SDValue Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, LoStart);
+ 
+   SDValue HiStart = getConstant(LoVT.getVectorNumElements(), DL,
+                                 TLI->getVectorIdxTy(getDataLayout()));
+   SDValue Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N, HiStart);
+ 
+   return std::make_pair(Lo, Hi);
+ }
+ 
+ /// Append to \p Args one EXTRACT_VECTOR_ELT node for each of \p Count
+ /// consecutive elements of \p Op, beginning at element \p Start. A \p Count
+ /// of 0 is replaced by the number of elements of \p Op's vector type.
+ void SelectionDAG::ExtractVectorElements(SDValue Op,
+                                          SmallVectorImpl<SDValue> &Args,
+                                          unsigned Start, unsigned Count) {
+   EVT VecVT = Op.getValueType();
+   if (Count == 0)
+     Count = VecVT.getVectorNumElements();
+ 
+   EVT ScalarVT = VecVT.getVectorElementType();
+   EVT IndexVT = TLI->getVectorIdxTy(getDataLayout());
+   SDLoc Loc(Op);
+   for (unsigned Elt = Start, End = Start + Count; Elt != End; ++Elt) {
+     SDValue EltIdx = getConstant(Elt, Loc, IndexVT);
+     Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT, Op, EltIdx));
+   }
+ }
+ 
+ // getAddressSpace - Return the address space of the type of the global
+ // this GlobalAddress refers to.
+ unsigned GlobalAddressSDNode::getAddressSpace() const {
+   const GlobalValue *Global = getGlobal();
+   return Global->getType()->getAddressSpace();
+ }
+ 
+ /// Return the type of the constant-pool entry, taken from the machine
+ /// constant-pool value or from the IR constant, whichever this node holds.
+ Type *ConstantPoolSDNode::getType() const {
+   return isMachineConstantPoolEntry() ? Val.MachineCPVal->getType()
+                                       : Val.ConstVal->getType();
+ }
+ 
+ /// Check whether this build vector consists only of constants and undefs,
+ /// and if so find the smallest element size that splats it.
+ ///
+ /// \param SplatValue   [out] The splatted bits, SplatBitSize wide on success.
+ /// \param SplatUndef   [out] Mask of the bits that come from undef elements.
+ /// \param SplatBitSize [out] Width in bits of the smallest splat found; only
+ ///                     written when true is returned.
+ /// \param HasAnyUndefs [out] True if any operand was undef; only written once
+ ///                     all operands have been accepted.
+ /// \param MinSplatBits Do not look for splats narrower than this many bits.
+ /// \param IsBigEndian  Lay the operands out in reverse order.
+ /// \returns false if MinSplatBits exceeds the vector width or an operand is
+ /// neither undef nor an integer/FP constant; true otherwise.
+ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
+                                         unsigned &SplatBitSize,
+                                         bool &HasAnyUndefs,
+                                         unsigned MinSplatBits,
+                                         bool IsBigEndian) const {
+   EVT VT = getValueType(0);
+   assert(VT.isVector() && "Expected a vector type");
+   unsigned VecWidth = VT.getSizeInBits();
+   if (MinSplatBits > VecWidth)
+     return false;
+ 
+   // FIXME: The widths are based on this node's type, but build vectors can
+   // truncate their operands.
+   SplatValue = APInt(VecWidth, 0);
+   SplatUndef = APInt(VecWidth, 0);
+ 
+   // Get the bits. Bits with undefined values (when the corresponding element
+   // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+   // in SplatValue. If any of the values are not constant, give up and return
+   // false.
+   unsigned int NumOps = getNumOperands();
+   assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
+   unsigned EltWidth = VT.getScalarSizeInBits();
+ 
+   for (unsigned j = 0; j < NumOps; ++j) {
+     // j is the position in the bit image, i the operand placed there.
+     unsigned i = IsBigEndian ? NumOps - 1 - j : j;
+     SDValue OpVal = getOperand(i);
+     unsigned BitPos = j * EltWidth;
+ 
+     if (OpVal.isUndef())
+       SplatUndef.setBits(BitPos, BitPos + EltWidth);
+     else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
+       SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+     else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+       SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
+     else
+       return false;
+   }
+ 
+   // The build_vector is all constants or undefs. Find the smallest element
+   // size that splats the vector.
+   HasAnyUndefs = (SplatUndef != 0);
+ 
+   // FIXME: This does not work for vectors with elements less than 8 bits.
+   // Repeatedly fold the value in half for as long as both halves agree.
+   while (VecWidth > 8) {
+     unsigned HalfSize = VecWidth / 2;
+     APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
+     APInt LowValue = SplatValue.trunc(HalfSize);
+     APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
+     APInt LowUndef = SplatUndef.trunc(HalfSize);
+ 
+     // If the two halves do not match (ignoring undef bits), stop here.
+     if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+         MinSplatBits > HalfSize)
+       break;
+ 
+     // A bit stays undef only if it is undef in both halves.
+     SplatValue = HighValue | LowValue;
+     SplatUndef = HighUndef & LowUndef;
+ 
+     VecWidth = HalfSize;
+   }
+ 
+   SplatBitSize = VecWidth;
+   return true;
+ }
+ 
+ /// Return the value all non-undef operands of this build vector share, or an
+ /// empty SDValue if two non-undef operands differ. If every operand is undef,
+ /// operand 0 is returned.
+ ///
+ /// \param UndefElements Optional; when non-null it is cleared, resized to the
+ ///                      operand count, and the bit of each undef operand
+ ///                      visited is set.
+ SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
+   if (UndefElements) {
+     UndefElements->clear();
+     UndefElements->resize(getNumOperands());
+   }
+ 
+   SDValue Common;
+   for (unsigned Idx = 0, NumOps = getNumOperands(); Idx != NumOps; ++Idx) {
+     SDValue Elt = getOperand(Idx);
+     if (Elt.isUndef()) {
+       if (UndefElements)
+         (*UndefElements)[Idx] = true;
+       continue;
+     }
+     // The first defined operand becomes the candidate.
+     if (!Common) {
+       Common = Elt;
+       continue;
+     }
+     if (Common != Elt)
+       return SDValue();
+   }
+ 
+   if (Common)
+     return Common;
+ 
+   assert(getOperand(0).isUndef() &&
+          "Can only have a splat without a constant for all undefs.");
+   return getOperand(0);
+ }
+ 
+ /// Return the splat value of this build vector as a ConstantSDNode, or
+ /// nullptr if there is no splat value or it is not an integer constant.
+ /// \p UndefElements is forwarded to getSplatValue.
+ ConstantSDNode *
+ BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
+   SDValue Splat = getSplatValue(UndefElements);
+   return dyn_cast_or_null<ConstantSDNode>(Splat);
+ }
+ 
+ /// Return the splat value of this build vector as a ConstantFPSDNode, or
+ /// nullptr if there is no splat value or it is not an FP constant.
+ /// \p UndefElements is forwarded to getSplatValue.
+ ConstantFPSDNode *
+ BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
+   SDValue Splat = getSplatValue(UndefElements);
+   return dyn_cast_or_null<ConstantFPSDNode>(Splat);
+ }
+ 
+ int32_t
+ BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
+                                                    uint32_t BitWidth) const {
+   if (ConstantFPSDNode *CN =
+           dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
+     bool IsExact;
+     APSInt IntVal(BitWidth);
+     const APFloat &APF = CN->getValueAPF();
+     if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
+             APFloat::opOK ||
+         !IsExact)
+       return -1;
+ 
+     return IntVal.exactLogBase2();
+   }
+   return -1;
+ }
+ 
+ bool BuildVectorSDNode::isConstant() const {
+   for (const SDValue &Op : op_values()) {
+     unsigned Opc = Op.getOpcode();
+     if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP)
+       return false;
+   }
+   return true;
+ }
+ 
+ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
+   // Find the first non-undef value in the shuffle mask.
+   unsigned i, e;
+   for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+     /* search */;
+ 
+   assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
+ 
+   // Make sure all remaining elements are either undef or the same as the first
+   // non-undef value.
+   for (int Idx = Mask[i]; i != e; ++i)
+     if (Mask[i] >= 0 && Mask[i] != Idx)
+       return false;
+   return true;
+ }
+ 
+ // Returns the SDNode if it is a constant integer BuildVector
+ // or constant integer.
+ SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
+   if (isa<ConstantSDNode>(N))
+     return N.getNode();
+   if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
+     return N.getNode();
+   // Treat a GlobalAddress supporting constant offset folding as a
+   // constant integer.
+   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+     if (GA->getOpcode() == ISD::GlobalAddress &&
+         TLI->isOffsetFoldingLegal(GA))
+       return GA;
+   return nullptr;
+ }
+ 
+ SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
+   if (isa<ConstantFPSDNode>(N))
+     return N.getNode();
+ 
+   if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
+     return N.getNode();
+ 
+   return nullptr;
+ }
+ 
+ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
+   assert(!Node->OperandList && "Node already has operands");
+   assert(std::numeric_limits<decltype(SDNode::NumOperands)>::max() >
+              Vals.size() &&
+          "too many operands to fit into SDNode");
+   SDUse *Ops = OperandRecycler.allocate(
+       ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+ 
+   bool IsDivergent = false;
+   for (unsigned I = 0; I != Vals.size(); ++I) {
+     Ops[I].setUser(Node);
+     Ops[I].setInitial(Vals[I]);
+     if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
+       IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent();
+   }
+   Node->NumOperands = Vals.size();
+   Node->OperandList = Ops;
+   IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
+   if (!TLI->isSDNodeAlwaysUniform(Node))
+     Node->SDNodeBits.IsDivergent = IsDivergent;
+   checkForCycles(Node);
+ }
+ 
+ #ifndef NDEBUG
+ static void checkForCyclesHelper(const SDNode *N,
+                                  SmallPtrSetImpl<const SDNode*> &Visited,
+                                  SmallPtrSetImpl<const SDNode*> &Checked,
+                                  const llvm::SelectionDAG *DAG) {
+   // If this node has already been checked, don't check it again.
+   if (Checked.count(N))
+     return;
+ 
+   // If a node has already been visited on this depth-first walk, reject it as
+   // a cycle.
+   if (!Visited.insert(N).second) {
+     errs() << "Detected cycle in SelectionDAG\n";
+     dbgs() << "Offending node:\n";
+     N->dumprFull(DAG); dbgs() << "\n";
+     abort();
+   }
+ 
+   for (const SDValue &Op : N->op_values())
+     checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG);
+ 
+   Checked.insert(N);
+   Visited.erase(N);
+ }
+ #endif
+ 
+ void llvm::checkForCycles(const llvm::SDNode *N,
+                           const llvm::SelectionDAG *DAG,
+                           bool force) {
+ #ifndef NDEBUG
+   bool check = force;
+ #ifdef EXPENSIVE_CHECKS
+   check = true;
+ #endif  // EXPENSIVE_CHECKS
+   if (check) {
+     assert(N && "Checking nonexistent SDNode");
+     SmallPtrSet<const SDNode*, 32> visited;
+     SmallPtrSet<const SDNode*, 32> checked;
+     checkForCyclesHelper(N, visited, checked, DAG);
+   }
+ #endif  // !NDEBUG
+ }
+ 
+ void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
+   checkForCycles(DAG->getRoot().getNode(), DAG, force);
+ }
+diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
+index 4a64fe0961e..d094620f1bf 100644
+--- a/lib/Target/NVPTX/CMakeLists.txt
++++ b/lib/Target/NVPTX/CMakeLists.txt
+@@ -1,41 +1,42 @@
+ set(LLVM_TARGET_DEFINITIONS NVPTX.td)
+ 
+ tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer)
+ tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel)
+ tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info)
+ tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info)
+ tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
+ 
+ add_public_tablegen_target(NVPTXCommonTableGen)
+ 
+ set(NVPTXCodeGen_sources
+   NVPTXAllocaHoisting.cpp
+   NVPTXAsmPrinter.cpp
+   NVPTXAssignValidGlobalNames.cpp
+   NVPTXFrameLowering.cpp
+   NVPTXGenericToNVVM.cpp
+   NVPTXISelDAGToDAG.cpp
+   NVPTXISelLowering.cpp
+   NVPTXImageOptimizer.cpp
+   NVPTXInstrInfo.cpp
+   NVPTXLowerAggrCopies.cpp
+   NVPTXLowerArgs.cpp
+   NVPTXLowerAlloca.cpp
+   NVPTXPeephole.cpp
+   NVPTXMCExpr.cpp
+   NVPTXPrologEpilogPass.cpp
+   NVPTXRegisterInfo.cpp
+   NVPTXReplaceImageHandles.cpp
+   NVPTXSubtarget.cpp
+   NVPTXTargetMachine.cpp
+   NVPTXTargetTransformInfo.cpp
+   NVPTXUtilities.cpp
+   NVVMIntrRange.cpp
+   NVVMReflect.cpp
++  NVPTXProxyRegErasure.cpp
+   )
+ 
+ add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
+ 
+ add_subdirectory(InstPrinter)
+ add_subdirectory(MCTargetDesc)
+ add_subdirectory(TargetInfo)
+diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
+index 02b8d8fff64..07bfc58a8da 100644
+--- a/lib/Target/NVPTX/NVPTX.h
++++ b/lib/Target/NVPTX/NVPTX.h
+@@ -1,177 +1,178 @@
+ //===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file contains the entry points for global functions defined in
+ // the LLVM NVPTX back-end.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
+ #define LLVM_LIB_TARGET_NVPTX_NVPTX_H
+ 
+ #include "MCTargetDesc/NVPTXBaseInfo.h"
+ #include "llvm/ADT/StringMap.h"
+ #include "llvm/IR/Module.h"
+ #include "llvm/IR/Value.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Target/TargetMachine.h"
+ #include <cassert>
+ #include <iosfwd>
+ 
+ namespace llvm {
+ class NVPTXTargetMachine;
+ class FunctionPass;
+ class MachineFunctionPass;
+ class formatted_raw_ostream;
+ 
+ namespace NVPTXCC {
+ enum CondCodes {
+   EQ,
+   NE,
+   LT,
+   LE,
+   GT,
+   GE
+ };
+ }
+ 
+ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                  llvm::CodeGenOpt::Level OptLevel);
+ ModulePass *createNVPTXAssignValidGlobalNamesPass();
+ ModulePass *createGenericToNVVMPass();
+ FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+ FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
+ MachineFunctionPass *createNVPTXPrologEpilogPass();
+ MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+ FunctionPass *createNVPTXImageOptimizerPass();
+ FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
+ BasicBlockPass *createNVPTXLowerAllocaPass();
+ MachineFunctionPass *createNVPTXPeephole();
++MachineFunctionPass *createNVPTXProxyRegErasurePass();
+ 
+ Target &getTheNVPTXTarget32();
+ Target &getTheNVPTXTarget64();
+ 
+ namespace NVPTX {
+ enum DrvInterface {
+   NVCL,
+   CUDA
+ };
+ 
+ // A field inside TSFlags needs a shift and a mask. The usage is
+ // always as follows :
+ // ((TSFlags & fieldMask) >> fieldShift)
+ // The enum keeps the mask, the shift, and all valid values of the
+ // field in one place.
+ enum VecInstType {
+   VecInstTypeShift = 0,
+   VecInstTypeMask = 0xF,
+ 
+   VecNOP = 0,
+   VecLoad = 1,
+   VecStore = 2,
+   VecBuild = 3,
+   VecShuffle = 4,
+   VecExtract = 5,
+   VecInsert = 6,
+   VecDest = 7,
+   VecOther = 15
+ };
+ 
+ enum SimpleMove {
+   SimpleMoveMask = 0x10,
+   SimpleMoveShift = 4
+ };
+ enum LoadStore {
+   isLoadMask = 0x20,
+   isLoadShift = 5,
+   isStoreMask = 0x40,
+   isStoreShift = 6
+ };
+ 
+ namespace PTXLdStInstCode {
+ enum AddressSpace {
+   GENERIC = 0,
+   GLOBAL = 1,
+   CONSTANT = 2,
+   SHARED = 3,
+   PARAM = 4,
+   LOCAL = 5
+ };
+ enum FromType {
+   Unsigned = 0,
+   Signed,
+   Float,
+   Untyped
+ };
+ enum VecType {
+   Scalar = 1,
+   V2 = 2,
+   V4 = 4
+ };
+ }
+ 
+ /// PTXCvtMode - Conversion code enumeration
+ namespace PTXCvtMode {
+ enum CvtMode {
+   NONE = 0,
+   RNI,
+   RZI,
+   RMI,
+   RPI,
+   RN,
+   RZ,
+   RM,
+   RP,
+ 
+   BASE_MASK = 0x0F,
+   FTZ_FLAG = 0x10,
+   SAT_FLAG = 0x20
+ };
+ }
+ 
+ /// PTXCmpMode - Comparison mode enumeration
+ namespace PTXCmpMode {
+ enum CmpMode {
+   EQ = 0,
+   NE,
+   LT,
+   LE,
+   GT,
+   GE,
+   LO,
+   LS,
+   HI,
+   HS,
+   EQU,
+   NEU,
+   LTU,
+   LEU,
+   GTU,
+   GEU,
+   NUM,
+   // NAN is a MACRO
+   NotANumber,
+ 
+   BASE_MASK = 0xFF,
+   FTZ_FLAG = 0x100
+ };
+ }
+ }
+ } // end namespace llvm;
+ 
+ // Defines symbolic names for NVPTX registers.  This defines a mapping from
+ // register name to register number.
+ #define GET_REGINFO_ENUM
+ #include "NVPTXGenRegisterInfo.inc"
+ 
+ // Defines symbolic names for the NVPTX instructions.
+ #define GET_INSTRINFO_ENUM
+ #include "NVPTXGenInstrInfo.inc"
+ 
+ #endif
+diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
+index 5c16c34e21d..fcca43d54b3 100644
+--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
++++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
+@@ -1,4761 +1,4795 @@
+ //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file defines the interfaces that NVPTX uses to lower LLVM code into a
+ // selection DAG.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #include "NVPTXISelLowering.h"
+ #include "MCTargetDesc/NVPTXBaseInfo.h"
+ #include "NVPTX.h"
+ #include "NVPTXSubtarget.h"
+ #include "NVPTXTargetMachine.h"
+ #include "NVPTXTargetObjectFile.h"
+ #include "NVPTXUtilities.h"
+ #include "llvm/ADT/APInt.h"
+ #include "llvm/ADT/SmallVector.h"
+ #include "llvm/ADT/StringRef.h"
+ #include "llvm/CodeGen/Analysis.h"
+ #include "llvm/CodeGen/MachineFunction.h"
+ #include "llvm/CodeGen/MachineMemOperand.h"
+ #include "llvm/CodeGen/SelectionDAG.h"
+ #include "llvm/CodeGen/SelectionDAGNodes.h"
+ #include "llvm/CodeGen/TargetCallingConv.h"
+ #include "llvm/CodeGen/TargetLowering.h"
+ #include "llvm/CodeGen/ValueTypes.h"
+ #include "llvm/IR/Argument.h"
+ #include "llvm/IR/Attributes.h"
+ #include "llvm/IR/CallSite.h"
+ #include "llvm/IR/Constants.h"
+ #include "llvm/IR/DataLayout.h"
+ #include "llvm/IR/DerivedTypes.h"
+ #include "llvm/IR/Function.h"
+ #include "llvm/IR/GlobalValue.h"
+ #include "llvm/IR/Instruction.h"
+ #include "llvm/IR/Instructions.h"
+ #include "llvm/IR/Module.h"
+ #include "llvm/IR/Type.h"
+ #include "llvm/IR/Value.h"
+ #include "llvm/Support/Casting.h"
+ #include "llvm/Support/CodeGen.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/Support/ErrorHandling.h"
+ #include "llvm/Support/MachineValueType.h"
+ #include "llvm/Support/MathExtras.h"
+ #include "llvm/Support/raw_ostream.h"
+ #include "llvm/Target/TargetMachine.h"
+ #include "llvm/Target/TargetOptions.h"
+ #include <algorithm>
+ #include <cassert>
+ #include <cstdint>
+ #include <iterator>
+ #include <sstream>
+ #include <string>
+ #include <utility>
+ #include <vector>
+ 
+ #define DEBUG_TYPE "nvptx-lower"
+ 
+ using namespace llvm;
+ 
+ static unsigned int uniqueCallSite = 0;
+ 
+ static cl::opt<bool> sched4reg(
+     "nvptx-sched4reg",
+     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+ 
+ static cl::opt<unsigned>
+ FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                     cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                              " 1: do it  2: do it aggressively"),
+                     cl::init(2));
+ 
+ static cl::opt<int> UsePrecDivF32(
+     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+              " IEEE Compliant F32 div.rnd if available."),
+     cl::init(2));
+ 
+ static cl::opt<bool> UsePrecSqrtF32(
+     "nvptx-prec-sqrtf32", cl::Hidden,
+     cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+     cl::init(true));
+ 
+ static cl::opt<bool> FtzEnabled(
+     "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+     cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+     cl::init(false));
+ 
+ int NVPTXTargetLowering::getDivF32Level() const {
+   if (UsePrecDivF32.getNumOccurrences() > 0) {
+     // If nvptx-prec-div32=N is used on the command-line, always honor it
+     return UsePrecDivF32;
+   } else {
+     // Otherwise, use div.approx if fast math is enabled
+     if (getTargetMachine().Options.UnsafeFPMath)
+       return 0;
+     else
+       return 2;
+   }
+ }
+ 
+ bool NVPTXTargetLowering::usePrecSqrtF32() const {
+   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+     return UsePrecSqrtF32;
+   } else {
+     // Otherwise, use sqrt.approx if fast math is enabled
+     return !getTargetMachine().Options.UnsafeFPMath;
+   }
+ }
+ 
+ bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
+   // TODO: Get rid of this flag; there can be only one way to do this.
+   if (FtzEnabled.getNumOccurrences() > 0) {
+     // If nvptx-f32ftz is used on the command-line, always honor it
+     return FtzEnabled;
+   } else {
+     const Function &F = MF.getFunction();
+     // Otherwise, check for an nvptx-f32ftz attribute on the function
+     if (F.hasFnAttribute("nvptx-f32ftz"))
+       return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+     else
+       return false;
+   }
+ }
+ 
+ static bool IsPTXVectorType(MVT VT) {
+   switch (VT.SimpleTy) {
+   default:
+     return false;
+   case MVT::v2i1:
+   case MVT::v4i1:
+   case MVT::v2i8:
+   case MVT::v4i8:
+   case MVT::v2i16:
+   case MVT::v4i16:
+   case MVT::v2i32:
+   case MVT::v4i32:
+   case MVT::v2i64:
+   case MVT::v2f16:
+   case MVT::v4f16:
+   case MVT::v8f16: // <4 x f16x2>
+   case MVT::v2f32:
+   case MVT::v4f32:
+   case MVT::v2f64:
+     return true;
+   }
+ }
+ 
+ /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+ /// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
+ /// into their primitive components.
+ /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+ /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+ /// LowerCall, and LowerReturn.
+ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                                Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
+                                uint64_t StartingOffset = 0) {
+   SmallVector<EVT, 16> TempVTs;
+   SmallVector<uint64_t, 16> TempOffsets;
+ 
+   // Special case for i128 - decompose to (i64, i64)
+   if (Ty->isIntegerTy(128)) {
+     ValueVTs.push_back(EVT(MVT::i64));
+     ValueVTs.push_back(EVT(MVT::i64));
+ 
+     if (Offsets) {
+       Offsets->push_back(StartingOffset + 0);
+       Offsets->push_back(StartingOffset + 8);
+     }
+ 
+     return;
+   }
+ 
+   // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
+   if (StructType *STy = dyn_cast<StructType>(Ty)) {
+     auto const *SL = DL.getStructLayout(STy);
+     auto ElementNum = 0;
+     for(auto *EI : STy->elements()) {
+       ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
+                          StartingOffset + SL->getElementOffset(ElementNum));
+       ++ElementNum;
+     }
+     return;
+   }
+ 
+   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
+   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+     EVT VT = TempVTs[i];
+     uint64_t Off = TempOffsets[i];
+     // Split vectors into individual elements, except for v2f16, which
+     // we will pass as a single scalar.
+     if (VT.isVector()) {
+       unsigned NumElts = VT.getVectorNumElements();
+       EVT EltVT = VT.getVectorElementType();
+       // Vectors with an even number of f16 elements will be passed to
+       // us as an array of v2f16 elements. We must match this so we
+       // stay in sync with Ins/Outs.
+       if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+         EltVT = MVT::v2f16;
+         NumElts /= 2;
+       }
+       for (unsigned j = 0; j != NumElts; ++j) {
+         ValueVTs.push_back(EltVT);
+         if (Offsets)
+           Offsets->push_back(Off + j * EltVT.getStoreSize());
+       }
+     } else {
+       ValueVTs.push_back(VT);
+       if (Offsets)
+         Offsets->push_back(Off);
+     }
+   }
+ }
+ 
+ // Check whether we can merge loads/stores of some of the pieces of a
+ // flattened function parameter or return value into a single vector
+ // load/store.
+ //
+ // The flattened parameter is represented as a list of EVTs and
+ // offsets, and the whole structure is aligned to ParamAlignment. This
+ // function determines whether we can load/store pieces of the
+ // parameter starting at index Idx using a single vectorized op of
+ // size AccessSize. If so, it returns the number of param pieces
+ // covered by the vector op. Otherwise, it returns 1.
+ static unsigned CanMergeParamLoadStoresStartingAt(
+     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
+     const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
+   assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+ 
+   // Can't vectorize if param alignment is not sufficient.
+   if (AccessSize > ParamAlignment)
+     return 1;
+   // Can't vectorize if offset is not aligned.
+   if (Offsets[Idx] & (AccessSize - 1))
+     return 1;
+ 
+   EVT EltVT = ValueVTs[Idx];
+   unsigned EltSize = EltVT.getStoreSize();
+ 
+   // Element is too large to vectorize.
+   if (EltSize >= AccessSize)
+     return 1;
+ 
+   unsigned NumElts = AccessSize / EltSize;
+   // Can't vectorize if AccessBytes if not a multiple of EltSize.
+   if (AccessSize != EltSize * NumElts)
+     return 1;
+ 
+   // We don't have enough elements to vectorize.
+   if (Idx + NumElts > ValueVTs.size())
+     return 1;
+ 
+   // PTX ISA can only deal with 2- and 4-element vector ops.
+   if (NumElts != 4 && NumElts != 2)
+     return 1;
+ 
+   for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
+     // Types do not match.
+     if (ValueVTs[j] != EltVT)
+       return 1;
+ 
+     // Elements are not contiguous.
+     if (Offsets[j] - Offsets[j - 1] != EltSize)
+       return 1;
+   }
+   // OK. We can vectorize ValueVTs[i..i+NumElts)
+   return NumElts;
+ }
+ 
+ // Flags for tracking per-element vectorization state of loads/stores
+ // of a flattened function parameter or return value.
+ enum ParamVectorizationFlags {
+   PVF_INNER = 0x0, // Middle elements of a vector.
+   PVF_FIRST = 0x1, // First element of the vector.
+   PVF_LAST = 0x2,  // Last element of the vector.
+   // Scalar is effectively a 1-element vector.
+   PVF_SCALAR = PVF_FIRST | PVF_LAST
+ };
+ 
+ // Computes whether and how we can vectorize the loads/stores of a
+ // flattened function parameter or return value.
+ //
+ // The flattened parameter is represented as the list of ValueVTs and
+ // Offsets, and is aligned to ParamAlignment bytes. We return a vector
+ // of the same size as ValueVTs indicating how each piece should be
+ // loaded/stored (i.e. as a scalar, or as part of a vector
+ // load/store).
+ static SmallVector<ParamVectorizationFlags, 16>
+ VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+                      const SmallVectorImpl<uint64_t> &Offsets,
+                      unsigned ParamAlignment) {
+   // Set vector size to match ValueVTs and mark all elements as
+   // scalars by default.
+   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
+   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
+ 
+   // Check what we can vectorize using 128/64/32-bit accesses.
+   for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+     // Skip elements we've already processed.
+     assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
+     for (unsigned AccessSize : {16, 8, 4, 2}) {
+       unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+           I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+       // Mark vectorized elements.
+       switch (NumElts) {
+       default:
+         llvm_unreachable("Unexpected return value");
+       case 1:
+         // Can't vectorize using this size, try next smaller size.
+         continue;
+       case 2:
+         assert(I + 1 < E && "Not enough elements.");
+         VectorInfo[I] = PVF_FIRST;
+         VectorInfo[I + 1] = PVF_LAST;
+         I += 1;
+         break;
+       case 4:
+         assert(I + 3 < E && "Not enough elements.");
+         VectorInfo[I] = PVF_FIRST;
+         VectorInfo[I + 1] = PVF_INNER;
+         VectorInfo[I + 2] = PVF_INNER;
+         VectorInfo[I + 3] = PVF_LAST;
+         I += 3;
+         break;
+       }
+       // Break out of the inner loop because we've already succeeded
+       // using largest possible AccessSize.
+       break;
+     }
+   }
+   return VectorInfo;
+ }
+ 
+ // NVPTXTargetLowering Constructor.
+ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                                          const NVPTXSubtarget &STI)
+     : TargetLowering(TM), nvTM(&TM), STI(STI) {
+   // always lower memset, memcpy, and memmove intrinsics to load/store
+   // instructions, rather
+   // then generating calls to memset, mempcy or memmove.
+   MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+   MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+   MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+ 
+   setBooleanContents(ZeroOrNegativeOneBooleanContent);
+   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+ 
+   // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+   // condition branches.
+   setJumpIsExpensive(true);
+ 
+   // Wide divides are _very_ slow. Try to reduce the width of the divide if
+   // possible.
+   addBypassSlowDiv(64, 32);
+ 
+   // By default, use the Source scheduling
+   if (sched4reg)
+     setSchedulingPreference(Sched::RegPressure);
+   else
+     setSchedulingPreference(Sched::Source);
+ 
+   auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+                                     LegalizeAction NoF16Action) {
+     setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+   };
+ 
+   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+   addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+   addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+ 
+   // Conversion to/from FP16/FP16x2 is always legal.
+   setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+   setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
+   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
+ 
+   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+   setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
+ 
+   // Operations not directly supported by NVPTX.
+   for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
+                  MVT::i16, MVT::i32, MVT::i64}) {
+     setOperationAction(ISD::SELECT_CC, VT, Expand);
+     setOperationAction(ISD::BR_CC, VT, Expand);
+   }
+ 
+   // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+   // For others we will expand to a SHL/SRA pair.
+   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ 
+   setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
+   setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
+   setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
+   setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
+   setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
+   setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
+ 
+   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ 
+   // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
+   // that don't have h/w rotation we lower them to multi-instruction assembly.
+   // See ROT*_sw in NVPTXIntrInfo.td
+   setOperationAction(ISD::ROTL, MVT::i64, Legal);
+   setOperationAction(ISD::ROTR, MVT::i64, Legal);
+   setOperationAction(ISD::ROTL, MVT::i32, Legal);
+   setOperationAction(ISD::ROTR, MVT::i32, Legal);
+ 
+   setOperationAction(ISD::ROTL, MVT::i16, Expand);
+   setOperationAction(ISD::ROTR, MVT::i16, Expand);
+   setOperationAction(ISD::ROTL, MVT::i8, Expand);
+   setOperationAction(ISD::ROTR, MVT::i8, Expand);
+   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+   setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+ 
+   // Indirect branch is not supported.
+   // This also disables Jump Table creation.
+   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+   setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ 
+   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ 
+   // We want to legalize constant related memmove and memcopy
+   // intrinsics.
+   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ 
+   // Turn FP extload into load/fpextend
+   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+   // Turn FP truncstore into trunc + store.
+   // FIXME: vector types should also be expanded
+   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ 
+   // PTX does not support load / store predicate registers
+   setOperationAction(ISD::LOAD, MVT::i1, Custom);
+   setOperationAction(ISD::STORE, MVT::i1, Custom);
+ 
+   for (MVT VT : MVT::integer_valuetypes()) {
+     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+     setTruncStoreAction(VT, MVT::i1, Expand);
+   }
+ 
+   // This is legal in NVPTX
+   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+ 
+   // TRAP can be lowered to PTX trap
+   setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ 
+   // Register custom handling for vector loads/stores
+   for (MVT VT : MVT::vector_valuetypes()) {
+     if (IsPTXVectorType(VT)) {
+       setOperationAction(ISD::LOAD, VT, Custom);
+       setOperationAction(ISD::STORE, VT, Custom);
+       setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+     }
+   }
+ 
+   // Custom handling for i8 intrinsics
+   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ 
+   for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
+     setOperationAction(ISD::ABS,  Ty, Legal);
+     setOperationAction(ISD::SMIN, Ty, Legal);
+     setOperationAction(ISD::SMAX, Ty, Legal);
+     setOperationAction(ISD::UMIN, Ty, Legal);
+     setOperationAction(ISD::UMAX, Ty, Legal);
+ 
+     setOperationAction(ISD::CTPOP, Ty, Legal);
+     setOperationAction(ISD::CTLZ, Ty, Legal);
+   }
+ 
+   setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+   setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+   setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+ 
+   // PTX does not directly support SELP of i1, so promote to i32 first
+   setOperationAction(ISD::SELECT, MVT::i1, Custom);
+ 
+   // PTX cannot multiply two i64s in a single instruction.
+   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ 
+   // We have some custom DAG combine patterns for these nodes
+   setTargetDAGCombine(ISD::ADD);
+   setTargetDAGCombine(ISD::AND);
+   setTargetDAGCombine(ISD::FADD);
+   setTargetDAGCombine(ISD::MUL);
+   setTargetDAGCombine(ISD::SHL);
+   setTargetDAGCombine(ISD::SREM);
+   setTargetDAGCombine(ISD::UREM);
+ 
+   // setcc for f16x2 needs special handling to prevent legalizer's
+   // attempt to scalarize it due to v2i1 not being legal.
+   if (STI.allowFP16Math())
+     setTargetDAGCombine(ISD::SETCC);
+ 
+   // Promote fp16 arithmetic if fp16 hardware isn't available or the
+   // user passed --nvptx-no-fp16-math. The flag is useful because,
+   // although sm_53+ GPUs have some sort of FP16 support in
+   // hardware, only sm_53 and sm_60 have full implementation. Others
+   // only have token amount of hardware and are likely to run faster
+   // by using fp32 units instead.
+   for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+     setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+     setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+   }
+ 
+   // There's no neg.f16 instruction. Expand to (0-x).
+   setOperationAction(ISD::FNEG, MVT::f16, Expand);
+   setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+ 
+   // (would be) Library functions.
+ 
+   // These map to conversion instructions for scalar FP types.
+   for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+                          ISD::FROUND, ISD::FTRUNC}) {
+     setOperationAction(Op, MVT::f16, Legal);
+     setOperationAction(Op, MVT::f32, Legal);
+     setOperationAction(Op, MVT::f64, Legal);
+     setOperationAction(Op, MVT::v2f16, Expand);
+   }
+ 
+   // 'Expand' implements FCOPYSIGN without calling an external library.
+   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+   setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
+   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ 
+   // These map to corresponding instructions for f32/f64. f16 must be
+   // promoted to f32. v2f16 is expanded to f16, which is then promoted
+   // to f32.
+   for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+                          ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+     setOperationAction(Op, MVT::f16, Promote);
+     setOperationAction(Op, MVT::f32, Legal);
+     setOperationAction(Op, MVT::f64, Legal);
+     setOperationAction(Op, MVT::v2f16, Expand);
+   }
+   setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+   setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+   setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+   setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
+ 
+   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
+   // No FPOW or FREM in PTX.
+ 
+   // Now deduce the information based on the above mentioned
+   // actions
+   computeRegisterProperties(STI.getRegisterInfo());
+ }
+ 
+ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+   switch ((NVPTXISD::NodeType)Opcode) {
+   case NVPTXISD::FIRST_NUMBER:
+     break;
+   case NVPTXISD::CALL:
+     return "NVPTXISD::CALL";
+   case NVPTXISD::RET_FLAG:
+     return "NVPTXISD::RET_FLAG";
+   case NVPTXISD::LOAD_PARAM:
+     return "NVPTXISD::LOAD_PARAM";
+   case NVPTXISD::Wrapper:
+     return "NVPTXISD::Wrapper";
+   case NVPTXISD::DeclareParam:
+     return "NVPTXISD::DeclareParam";
+   case NVPTXISD::DeclareScalarParam:
+     return "NVPTXISD::DeclareScalarParam";
+   case NVPTXISD::DeclareRet:
+     return "NVPTXISD::DeclareRet";
+   case NVPTXISD::DeclareScalarRet:
+     return "NVPTXISD::DeclareScalarRet";
+   case NVPTXISD::DeclareRetParam:
+     return "NVPTXISD::DeclareRetParam";
+   case NVPTXISD::PrintCall:
+     return "NVPTXISD::PrintCall";
+   case NVPTXISD::PrintConvergentCall:
+     return "NVPTXISD::PrintConvergentCall";
+   case NVPTXISD::PrintCallUni:
+     return "NVPTXISD::PrintCallUni";
+   case NVPTXISD::PrintConvergentCallUni:
+     return "NVPTXISD::PrintConvergentCallUni";
+   case NVPTXISD::LoadParam:
+     return "NVPTXISD::LoadParam";
+   case NVPTXISD::LoadParamV2:
+     return "NVPTXISD::LoadParamV2";
+   case NVPTXISD::LoadParamV4:
+     return "NVPTXISD::LoadParamV4";
+   case NVPTXISD::StoreParam:
+     return "NVPTXISD::StoreParam";
+   case NVPTXISD::StoreParamV2:
+     return "NVPTXISD::StoreParamV2";
+   case NVPTXISD::StoreParamV4:
+     return "NVPTXISD::StoreParamV4";
+   case NVPTXISD::StoreParamS32:
+     return "NVPTXISD::StoreParamS32";
+   case NVPTXISD::StoreParamU32:
+     return "NVPTXISD::StoreParamU32";
+   case NVPTXISD::CallArgBegin:
+     return "NVPTXISD::CallArgBegin";
+   case NVPTXISD::CallArg:
+     return "NVPTXISD::CallArg";
+   case NVPTXISD::LastCallArg:
+     return "NVPTXISD::LastCallArg";
+   case NVPTXISD::CallArgEnd:
+     return "NVPTXISD::CallArgEnd";
+   case NVPTXISD::CallVoid:
+     return "NVPTXISD::CallVoid";
+   case NVPTXISD::CallVal:
+     return "NVPTXISD::CallVal";
+   case NVPTXISD::CallSymbol:
+     return "NVPTXISD::CallSymbol";
+   case NVPTXISD::Prototype:
+     return "NVPTXISD::Prototype";
+   case NVPTXISD::MoveParam:
+     return "NVPTXISD::MoveParam";
+   case NVPTXISD::StoreRetval:
+     return "NVPTXISD::StoreRetval";
+   case NVPTXISD::StoreRetvalV2:
+     return "NVPTXISD::StoreRetvalV2";
+   case NVPTXISD::StoreRetvalV4:
+     return "NVPTXISD::StoreRetvalV4";
+   case NVPTXISD::PseudoUseParam:
+     return "NVPTXISD::PseudoUseParam";
+   case NVPTXISD::RETURN:
+     return "NVPTXISD::RETURN";
+   case NVPTXISD::CallSeqBegin:
+     return "NVPTXISD::CallSeqBegin";
+   case NVPTXISD::CallSeqEnd:
+     return "NVPTXISD::CallSeqEnd";
+   case NVPTXISD::CallPrototype:
+     return "NVPTXISD::CallPrototype";
++  case NVPTXISD::ProxyReg:
++    return "NVPTXISD::ProxyReg";
+   case NVPTXISD::LoadV2:
+     return "NVPTXISD::LoadV2";
+   case NVPTXISD::LoadV4:
+     return "NVPTXISD::LoadV4";
+   case NVPTXISD::LDGV2:
+     return "NVPTXISD::LDGV2";
+   case NVPTXISD::LDGV4:
+     return "NVPTXISD::LDGV4";
+   case NVPTXISD::LDUV2:
+     return "NVPTXISD::LDUV2";
+   case NVPTXISD::LDUV4:
+     return "NVPTXISD::LDUV4";
+   case NVPTXISD::StoreV2:
+     return "NVPTXISD::StoreV2";
+   case NVPTXISD::StoreV4:
+     return "NVPTXISD::StoreV4";
+   case NVPTXISD::FUN_SHFL_CLAMP:
+     return "NVPTXISD::FUN_SHFL_CLAMP";
+   case NVPTXISD::FUN_SHFR_CLAMP:
+     return "NVPTXISD::FUN_SHFR_CLAMP";
+   case NVPTXISD::IMAD:
+     return "NVPTXISD::IMAD";
+   case NVPTXISD::SETP_F16X2:
+     return "NVPTXISD::SETP_F16X2";
+   case NVPTXISD::Dummy:
+     return "NVPTXISD::Dummy";
+   case NVPTXISD::MUL_WIDE_SIGNED:
+     return "NVPTXISD::MUL_WIDE_SIGNED";
+   case NVPTXISD::MUL_WIDE_UNSIGNED:
+     return "NVPTXISD::MUL_WIDE_UNSIGNED";
+   case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
+   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
+   case NVPTXISD::Tex1DFloatFloatLevel:
+     return "NVPTXISD::Tex1DFloatFloatLevel";
+   case NVPTXISD::Tex1DFloatFloatGrad:
+     return "NVPTXISD::Tex1DFloatFloatGrad";
+   case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
+   case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
+   case NVPTXISD::Tex1DS32FloatLevel:
+     return "NVPTXISD::Tex1DS32FloatLevel";
+   case NVPTXISD::Tex1DS32FloatGrad:
+     return "NVPTXISD::Tex1DS32FloatGrad";
+   case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
+   case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
+   case NVPTXISD::Tex1DU32FloatLevel:
+     return "NVPTXISD::Tex1DU32FloatLevel";
+   case NVPTXISD::Tex1DU32FloatGrad:
+     return "NVPTXISD::Tex1DU32FloatGrad";
+   case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
+   case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+   case NVPTXISD::Tex1DArrayFloatFloatLevel:
+     return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+   case NVPTXISD::Tex1DArrayFloatFloatGrad:
+     return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+   case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
+   case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
+   case NVPTXISD::Tex1DArrayS32FloatLevel:
+     return "NVPTXISD::Tex1DArrayS32FloatLevel";
+   case NVPTXISD::Tex1DArrayS32FloatGrad:
+     return "NVPTXISD::Tex1DArrayS32FloatGrad";
+   case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
+   case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
+   case NVPTXISD::Tex1DArrayU32FloatLevel:
+     return "NVPTXISD::Tex1DArrayU32FloatLevel";
+   case NVPTXISD::Tex1DArrayU32FloatGrad:
+     return "NVPTXISD::Tex1DArrayU32FloatGrad";
+   case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
+   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
+   case NVPTXISD::Tex2DFloatFloatLevel:
+     return "NVPTXISD::Tex2DFloatFloatLevel";
+   case NVPTXISD::Tex2DFloatFloatGrad:
+     return "NVPTXISD::Tex2DFloatFloatGrad";
+   case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
+   case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
+   case NVPTXISD::Tex2DS32FloatLevel:
+     return "NVPTXISD::Tex2DS32FloatLevel";
+   case NVPTXISD::Tex2DS32FloatGrad:
+     return "NVPTXISD::Tex2DS32FloatGrad";
+   case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
+   case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
+   case NVPTXISD::Tex2DU32FloatLevel:
+     return "NVPTXISD::Tex2DU32FloatLevel";
+   case NVPTXISD::Tex2DU32FloatGrad:
+     return "NVPTXISD::Tex2DU32FloatGrad";
+   case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
+   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+   case NVPTXISD::Tex2DArrayFloatFloatLevel:
+     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+   case NVPTXISD::Tex2DArrayFloatFloatGrad:
+     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+   case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
+   case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
+   case NVPTXISD::Tex2DArrayS32FloatLevel:
+     return "NVPTXISD::Tex2DArrayS32FloatLevel";
+   case NVPTXISD::Tex2DArrayS32FloatGrad:
+     return "NVPTXISD::Tex2DArrayS32FloatGrad";
+   case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
+   case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
+   case NVPTXISD::Tex2DArrayU32FloatLevel:
+     return "NVPTXISD::Tex2DArrayU32FloatLevel";
+   case NVPTXISD::Tex2DArrayU32FloatGrad:
+     return "NVPTXISD::Tex2DArrayU32FloatGrad";
+   case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
+   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
+   case NVPTXISD::Tex3DFloatFloatLevel:
+     return "NVPTXISD::Tex3DFloatFloatLevel";
+   case NVPTXISD::Tex3DFloatFloatGrad:
+     return "NVPTXISD::Tex3DFloatFloatGrad";
+   case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
+   case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
+   case NVPTXISD::Tex3DS32FloatLevel:
+     return "NVPTXISD::Tex3DS32FloatLevel";
+   case NVPTXISD::Tex3DS32FloatGrad:
+     return "NVPTXISD::Tex3DS32FloatGrad";
+   case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
+   case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
+   case NVPTXISD::Tex3DU32FloatLevel:
+     return "NVPTXISD::Tex3DU32FloatLevel";
+   case NVPTXISD::Tex3DU32FloatGrad:
+     return "NVPTXISD::Tex3DU32FloatGrad";
+   case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
+   case NVPTXISD::TexCubeFloatFloatLevel:
+     return "NVPTXISD::TexCubeFloatFloatLevel";
+   case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
+   case NVPTXISD::TexCubeS32FloatLevel:
+     return "NVPTXISD::TexCubeS32FloatLevel";
+   case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
+   case NVPTXISD::TexCubeU32FloatLevel:
+     return "NVPTXISD::TexCubeU32FloatLevel";
+   case NVPTXISD::TexCubeArrayFloatFloat:
+     return "NVPTXISD::TexCubeArrayFloatFloat";
+   case NVPTXISD::TexCubeArrayFloatFloatLevel:
+     return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+   case NVPTXISD::TexCubeArrayS32Float:
+     return "NVPTXISD::TexCubeArrayS32Float";
+   case NVPTXISD::TexCubeArrayS32FloatLevel:
+     return "NVPTXISD::TexCubeArrayS32FloatLevel";
+   case NVPTXISD::TexCubeArrayU32Float:
+     return "NVPTXISD::TexCubeArrayU32Float";
+   case NVPTXISD::TexCubeArrayU32FloatLevel:
+     return "NVPTXISD::TexCubeArrayU32FloatLevel";
+   case NVPTXISD::Tld4R2DFloatFloat:
+     return "NVPTXISD::Tld4R2DFloatFloat";
+   case NVPTXISD::Tld4G2DFloatFloat:
+     return "NVPTXISD::Tld4G2DFloatFloat";
+   case NVPTXISD::Tld4B2DFloatFloat:
+     return "NVPTXISD::Tld4B2DFloatFloat";
+   case NVPTXISD::Tld4A2DFloatFloat:
+     return "NVPTXISD::Tld4A2DFloatFloat";
+   case NVPTXISD::Tld4R2DS64Float:
+     return "NVPTXISD::Tld4R2DS64Float";
+   case NVPTXISD::Tld4G2DS64Float:
+     return "NVPTXISD::Tld4G2DS64Float";
+   case NVPTXISD::Tld4B2DS64Float:
+     return "NVPTXISD::Tld4B2DS64Float";
+   case NVPTXISD::Tld4A2DS64Float:
+     return "NVPTXISD::Tld4A2DS64Float";
+   case NVPTXISD::Tld4R2DU64Float:
+     return "NVPTXISD::Tld4R2DU64Float";
+   case NVPTXISD::Tld4G2DU64Float:
+     return "NVPTXISD::Tld4G2DU64Float";
+   case NVPTXISD::Tld4B2DU64Float:
+     return "NVPTXISD::Tld4B2DU64Float";
+   case NVPTXISD::Tld4A2DU64Float:
+     return "NVPTXISD::Tld4A2DU64Float";
+ 
+   case NVPTXISD::TexUnified1DFloatS32:
+     return "NVPTXISD::TexUnified1DFloatS32";
+   case NVPTXISD::TexUnified1DFloatFloat:
+     return "NVPTXISD::TexUnified1DFloatFloat";
+   case NVPTXISD::TexUnified1DFloatFloatLevel:
+     return "NVPTXISD::TexUnified1DFloatFloatLevel";
+   case NVPTXISD::TexUnified1DFloatFloatGrad:
+     return "NVPTXISD::TexUnified1DFloatFloatGrad";
+   case NVPTXISD::TexUnified1DS32S32:
+     return "NVPTXISD::TexUnified1DS32S32";
+   case NVPTXISD::TexUnified1DS32Float:
+     return "NVPTXISD::TexUnified1DS32Float";
+   case NVPTXISD::TexUnified1DS32FloatLevel:
+     return "NVPTXISD::TexUnified1DS32FloatLevel";
+   case NVPTXISD::TexUnified1DS32FloatGrad:
+     return "NVPTXISD::TexUnified1DS32FloatGrad";
+   case NVPTXISD::TexUnified1DU32S32:
+     return "NVPTXISD::TexUnified1DU32S32";
+   case NVPTXISD::TexUnified1DU32Float:
+     return "NVPTXISD::TexUnified1DU32Float";
+   case NVPTXISD::TexUnified1DU32FloatLevel:
+     return "NVPTXISD::TexUnified1DU32FloatLevel";
+   case NVPTXISD::TexUnified1DU32FloatGrad:
+     return "NVPTXISD::TexUnified1DU32FloatGrad";
+   case NVPTXISD::TexUnified1DArrayFloatS32:
+     return "NVPTXISD::TexUnified1DArrayFloatS32";
+   case NVPTXISD::TexUnified1DArrayFloatFloat:
+     return "NVPTXISD::TexUnified1DArrayFloatFloat";
+   case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+     return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+   case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+     return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+   case NVPTXISD::TexUnified1DArrayS32S32:
+     return "NVPTXISD::TexUnified1DArrayS32S32";
+   case NVPTXISD::TexUnified1DArrayS32Float:
+     return "NVPTXISD::TexUnified1DArrayS32Float";
+   case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+     return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+   case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+     return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+   case NVPTXISD::TexUnified1DArrayU32S32:
+     return "NVPTXISD::TexUnified1DArrayU32S32";
+   case NVPTXISD::TexUnified1DArrayU32Float:
+     return "NVPTXISD::TexUnified1DArrayU32Float";
+   case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+     return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+   case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+     return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+   case NVPTXISD::TexUnified2DFloatS32:
+     return "NVPTXISD::TexUnified2DFloatS32";
+   case NVPTXISD::TexUnified2DFloatFloat:
+     return "NVPTXISD::TexUnified2DFloatFloat";
+   case NVPTXISD::TexUnified2DFloatFloatLevel:
+     return "NVPTXISD::TexUnified2DFloatFloatLevel";
+   case NVPTXISD::TexUnified2DFloatFloatGrad:
+     return "NVPTXISD::TexUnified2DFloatFloatGrad";
+   case NVPTXISD::TexUnified2DS32S32:
+     return "NVPTXISD::TexUnified2DS32S32";
+   case NVPTXISD::TexUnified2DS32Float:
+     return "NVPTXISD::TexUnified2DS32Float";
+   case NVPTXISD::TexUnified2DS32FloatLevel:
+     return "NVPTXISD::TexUnified2DS32FloatLevel";
+   case NVPTXISD::TexUnified2DS32FloatGrad:
+     return "NVPTXISD::TexUnified2DS32FloatGrad";
+   case NVPTXISD::TexUnified2DU32S32:
+     return "NVPTXISD::TexUnified2DU32S32";
+   case NVPTXISD::TexUnified2DU32Float:
+     return "NVPTXISD::TexUnified2DU32Float";
+   case NVPTXISD::TexUnified2DU32FloatLevel:
+     return "NVPTXISD::TexUnified2DU32FloatLevel";
+   case NVPTXISD::TexUnified2DU32FloatGrad:
+     return "NVPTXISD::TexUnified2DU32FloatGrad";
+   case NVPTXISD::TexUnified2DArrayFloatS32:
+     return "NVPTXISD::TexUnified2DArrayFloatS32";
+   case NVPTXISD::TexUnified2DArrayFloatFloat:
+     return "NVPTXISD::TexUnified2DArrayFloatFloat";
+   case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+     return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+   case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+     return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+   case NVPTXISD::TexUnified2DArrayS32S32:
+     return "NVPTXISD::TexUnified2DArrayS32S32";
+   case NVPTXISD::TexUnified2DArrayS32Float:
+     return "NVPTXISD::TexUnified2DArrayS32Float";
+   case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+     return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+   case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+     return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+   case NVPTXISD::TexUnified2DArrayU32S32:
+     return "NVPTXISD::TexUnified2DArrayU32S32";
+   case NVPTXISD::TexUnified2DArrayU32Float:
+     return "NVPTXISD::TexUnified2DArrayU32Float";
+   case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+     return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+   case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+     return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+   case NVPTXISD::TexUnified3DFloatS32:
+     return "NVPTXISD::TexUnified3DFloatS32";
+   case NVPTXISD::TexUnified3DFloatFloat:
+     return "NVPTXISD::TexUnified3DFloatFloat";
+   case NVPTXISD::TexUnified3DFloatFloatLevel:
+     return "NVPTXISD::TexUnified3DFloatFloatLevel";
+   case NVPTXISD::TexUnified3DFloatFloatGrad:
+     return "NVPTXISD::TexUnified3DFloatFloatGrad";
+   case NVPTXISD::TexUnified3DS32S32:
+     return "NVPTXISD::TexUnified3DS32S32";
+   case NVPTXISD::TexUnified3DS32Float:
+     return "NVPTXISD::TexUnified3DS32Float";
+   case NVPTXISD::TexUnified3DS32FloatLevel:
+     return "NVPTXISD::TexUnified3DS32FloatLevel";
+   case NVPTXISD::TexUnified3DS32FloatGrad:
+     return "NVPTXISD::TexUnified3DS32FloatGrad";
+   case NVPTXISD::TexUnified3DU32S32:
+     return "NVPTXISD::TexUnified3DU32S32";
+   case NVPTXISD::TexUnified3DU32Float:
+     return "NVPTXISD::TexUnified3DU32Float";
+   case NVPTXISD::TexUnified3DU32FloatLevel:
+     return "NVPTXISD::TexUnified3DU32FloatLevel";
+   case NVPTXISD::TexUnified3DU32FloatGrad:
+     return "NVPTXISD::TexUnified3DU32FloatGrad";
+   case NVPTXISD::TexUnifiedCubeFloatFloat:
+     return "NVPTXISD::TexUnifiedCubeFloatFloat";
+   case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+     return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+   case NVPTXISD::TexUnifiedCubeS32Float:
+     return "NVPTXISD::TexUnifiedCubeS32Float";
+   case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+     return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+   case NVPTXISD::TexUnifiedCubeU32Float:
+     return "NVPTXISD::TexUnifiedCubeU32Float";
+   case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+     return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+   case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+     return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+   case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+     return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+   case NVPTXISD::TexUnifiedCubeArrayS32Float:
+     return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+   case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+     return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+   case NVPTXISD::TexUnifiedCubeArrayU32Float:
+     return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+   case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+     return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+   case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+     return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+   case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+     return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+   case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+     return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+   case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+     return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+   case NVPTXISD::Tld4UnifiedR2DS64Float:
+     return "NVPTXISD::Tld4UnifiedR2DS64Float";
+   case NVPTXISD::Tld4UnifiedG2DS64Float:
+     return "NVPTXISD::Tld4UnifiedG2DS64Float";
+   case NVPTXISD::Tld4UnifiedB2DS64Float:
+     return "NVPTXISD::Tld4UnifiedB2DS64Float";
+   case NVPTXISD::Tld4UnifiedA2DS64Float:
+     return "NVPTXISD::Tld4UnifiedA2DS64Float";
+   case NVPTXISD::Tld4UnifiedR2DU64Float:
+     return "NVPTXISD::Tld4UnifiedR2DU64Float";
+   case NVPTXISD::Tld4UnifiedG2DU64Float:
+     return "NVPTXISD::Tld4UnifiedG2DU64Float";
+   case NVPTXISD::Tld4UnifiedB2DU64Float:
+     return "NVPTXISD::Tld4UnifiedB2DU64Float";
+   case NVPTXISD::Tld4UnifiedA2DU64Float:
+     return "NVPTXISD::Tld4UnifiedA2DU64Float";
+ 
+   case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
+   case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
+   case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
+   case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
+   case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
+   case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
+   case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
+   case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
+   case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
+   case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
+   case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
+ 
+   case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
+   case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
+   case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
+   case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
+   case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+   case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+   case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+   case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+   case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+   case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+   case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+ 
+   case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
+   case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
+   case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
+   case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
+   case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
+   case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
+   case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
+   case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
+   case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
+   case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
+   case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
+ 
+   case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
+   case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
+   case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
+   case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
+   case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+   case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+   case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+   case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+   case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+   case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+   case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+ 
+   case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
+   case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
+   case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
+   case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
+   case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
+   case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
+   case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
+   case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
+   case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
+   case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
+   case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
+ 
+   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
+   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
+   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
+   case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
+   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
+   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
+   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
+   case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
+   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
+   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
+   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
+ 
+   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
+   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
+   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
+   case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
+   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
+   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
+   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
+   case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
+   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
+   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
+   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
+ 
+   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
+   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
+   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
+   case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
+   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
+   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
+   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
+   case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
+   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
+   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
+   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
+ 
+   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
+   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
+   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
+   case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
+   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
+   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
+   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
+   case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
+   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
+   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
+   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
+ 
+   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
+   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
+   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
+   case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
+   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
+   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
+   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
+   case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
+   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
+   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
+   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
+ 
+   case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
+   case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
+   case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
+   case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
+   case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
+   case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
+   case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
+   case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
+   case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
+   case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
+   case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
+ 
+   case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
+   case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
+   case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
+   case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
+   case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
+   case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
+   case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
+   case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
+   case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
+   case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
+   case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
+ 
+   case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
+   case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
+   case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
+   case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
+   case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
+   case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
+   case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
+   case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
+   case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
+   case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
+   case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
+ 
+   case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
+   case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
+   case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
+   case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
+   case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
+   case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
+   case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
+   case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
+   case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
+   case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
+   case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
+ 
+   case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
+   case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
+   case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
+   case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
+   case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
+   case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
+   case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
+   case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
+   case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
+   case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
+   case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
+   }
+   return nullptr;
+ }
+ 
+ TargetLoweringBase::LegalizeTypeAction // Legalization action chosen for VT.
+ NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
+   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+     return TypeSplitVector; // Multi-element i1 vectors are split.
+   if (VT == MVT::v2f16)
+     return TypeLegal; // v2f16 is reported as a legal vector type.
+   return TargetLoweringBase::getPreferredVectorAction(VT); // Generic default.
+ }
+ 
+ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+                                              int Enabled, int &ExtraSteps,
+                                              bool &UseOneConst, // Not modified here.
+                                              bool Reciprocal) const { // Returns an approx sqrt/rsqrt node, or SDValue().
+   if (!(Enabled == ReciprocalEstimate::Enabled ||
+         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+     return SDValue(); // Estimate not requested: hand back an empty value.
+ 
+   if (ExtraSteps == ReciprocalEstimate::Unspecified)
+     ExtraSteps = 0; // Unspecified means no refinement steps.
+ 
+   SDLoc DL(Operand);
+   EVT VT = Operand.getValueType();
+   bool Ftz = useF32FTZ(DAG.getMachineFunction()); // Picks the .ftz f32 intrinsics.
+ 
+   auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { // Applies intrinsic IID to Operand.
+     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                        DAG.getConstant(IID, DL, MVT::i32), Operand);
+   };
+ 
+   // The sqrt and rsqrt refinement processes assume we always start out with an
+   // approximation of the rsqrt.  Therefore, if we're going to do any refinement
+   // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
+   // any refinement, we must return a regular sqrt.
+   if (Reciprocal || ExtraSteps > 0) {
+     if (VT == MVT::f32)
+       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
+                                    : Intrinsic::nvvm_rsqrt_approx_f);
+     else if (VT == MVT::f64)
+       return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
+     else
+       return SDValue(); // Only f32 and f64 get an rsqrt estimate.
+   } else {
+     if (VT == MVT::f32)
+       return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
+                                    : Intrinsic::nvvm_sqrt_approx_f);
+     else { // NOTE(review): every non-f32 VT lands here; presumably only f64 -- confirm.
+       // There's no sqrt.approx.f64 instruction, so we emit
+       // reciprocal(rsqrt(x)).  This is faster than
+       // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
+       // x * rsqrt(x).)
+       return DAG.getNode(
+           ISD::INTRINSIC_WO_CHAIN, DL, VT,
+           DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
+           MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
+     }
+   }
+ }
+ 
+ SDValue // Rewrites a GlobalAddress node as a Wrapper around a target global address.
+ NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+   SDLoc dl(Op);
+   const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
+   auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); // Pointer type of the global's address space.
+   Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
+   return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
+ }
+ 
+ std::string NVPTXTargetLowering::getPrototype(
+     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
+     const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
+     ImmutableCallSite CS) const { // Builds the ".callprototype" text for a call.
+   auto PtrVT = getPointerTy(DL);
+ 
+   bool isABI = (STI.getSmVersion() >= 20);
+   assert(isABI && "Non-ABI compilation is not supported");
+   if (!isABI)
+     return ""; // Only reachable when the assert above is compiled out.
+ 
+   std::stringstream O;
+   O << "prototype_" << uniqueCallSite << " : .callprototype "; // Label, then keyword.
+ 
+   if (retTy->getTypeID() == Type::VoidTyID) {
+     O << "()"; // Void return: empty return list.
+   } else {
+     O << "(";
+     if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
+       unsigned size = 0;
+       if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
+         size = ITy->getBitWidth();
+       } else {
+         assert(retTy->isFloatingPointTy() &&
+                "Floating point type expected here");
+         size = retTy->getPrimitiveSizeInBits();
+       }
+       // PTX ABI requires all scalar return values to be at least 32
+       // bits in size.  fp16 normally uses .b16 as its storage type in
+       // PTX, so its size must be adjusted here, too.
+       if (size < 32)
+         size = 32;
+ 
+       O << ".param .b" << size << " _";
+     } else if (isa<PointerType>(retTy)) {
+       O << ".param .b" << PtrVT.getSizeInBits() << " _";
+     } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
+       auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); // NOTE(review): shadows the DL parameter; getCalledFunction() may be null for indirect calls -- confirm.
+       O << ".param .align " << retAlignment << " .b8 _["
+         << DL.getTypeAllocSize(retTy) << "]";
+     } else {
+       llvm_unreachable("Unknown return type");
+     }
+     O << ") ";
+   }
+   O << "_ (";
+ 
+   bool first = true; // Suppresses the ", " separator before the first parameter.
+ 
+   unsigned OIdx = 0;
+   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { // i indexes Args, OIdx indexes Outs.
+     Type *Ty = Args[i].Ty;
+     if (!first) {
+       O << ", ";
+     }
+     first = false;
+ 
+     if (!Outs[OIdx].Flags.isByVal()) {
+       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
+         unsigned align = 0;
+         const CallInst *CallI = cast<CallInst>(CS.getInstruction());
+         // +1 because index 0 is reserved for return type alignment
+         if (!getAlign(*CallI, i + 1, align))
+           align = DL.getABITypeAlignment(Ty);
+         unsigned sz = DL.getTypeAllocSize(Ty);
+         O << ".param .align " << align << " .b8 ";
+         O << "_";
+         O << "[" << sz << "]";
+         // Skip the additional Outs entries that this one argument expands into.
+         SmallVector<EVT, 16> vtparts;
+         ComputeValueVTs(*this, DL, Ty, vtparts);
+         if (unsigned len = vtparts.size())
+           OIdx += len - 1;
+         continue;
+       }
+       // i8 types in IR will be i16 types in SDAG
+       assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
+               (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+              "type mismatch between callee prototype and arguments");
+       // Scalar argument: emitted as ".param .b<bits> _".
+       unsigned sz = 0;
+       if (isa<IntegerType>(Ty)) {
+         sz = cast<IntegerType>(Ty)->getBitWidth();
+         if (sz < 32)
+           sz = 32;
+       } else if (isa<PointerType>(Ty)) {
+         sz = PtrVT.getSizeInBits();
+       } else if (Ty->isHalfTy())
+         // PTX ABI requires all scalar parameters to be at least 32
+         // bits in size.  fp16 normally uses .b16 as its storage type
+         // in PTX, so its size must be adjusted here, too.
+         sz = 32;
+       else
+         sz = Ty->getPrimitiveSizeInBits();
+       O << ".param .b" << sz << " ";
+       O << "_";
+       continue;
+     }
+     auto *PTy = dyn_cast<PointerType>(Ty); // byval: describe the pointee, not the pointer.
+     assert(PTy && "Param with byval attribute should be a pointer type");
+     Type *ETy = PTy->getElementType();
+ 
+     unsigned align = Outs[OIdx].Flags.getByValAlign();
+     unsigned sz = DL.getTypeAllocSize(ETy);
+     O << ".param .align " << align << " .b8 ";
+     O << "_";
+     O << "[" << sz << "]";
+   }
+   O << ");";
+   return O.str();
+ }
+ 
+ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Callee is not read in the body.
+                                                    ImmutableCallSite CS,
+                                                    Type *Ty, unsigned Idx,
+                                                    const DataLayout &DL) const { // Alignment for slot Idx, else Ty's ABI alignment.
+   if (!CS) {
+     // CallSite is zero, fallback to ABI type alignment
+     return DL.getABITypeAlignment(Ty);
+   }
+ 
+   unsigned Align = 0;
+   const Value *DirectCallee = CS.getCalledFunction();
+ 
+   if (!DirectCallee) {
+     // We don't have a direct function symbol, but that may be because of
+     // constant cast instructions in the call.
+     const Instruction *CalleeI = CS.getInstruction();
+     assert(CalleeI && "Call target is not a function or derived value?");
+ 
+     // With bitcast'd call targets, the instruction will be the call
+     if (isa<CallInst>(CalleeI)) {
+       // Check if we have call alignment metadata
+       if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+         return Align;
+ 
+       const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+       // Ignore any bitcast instructions
+       while (isa<ConstantExpr>(CalleeV)) {
+         const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+         if (!CE->isCast())
+           break; // Stop at the first constant expression that is not a cast.
+         // Look through the bitcast
+         CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+       }
+ 
+       // We have now looked past all of the bitcasts.  Do we finally have a
+       // Function?
+       if (isa<Function>(CalleeV))
+         DirectCallee = CalleeV;
+     }
+   }
+ 
+   // Check for function alignment information if we found that the
+   // ultimate target is a Function
+   if (DirectCallee)
+     if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
+       return Align;
+ 
+   // Call is indirect or alignment information is not available, fall back to
+   // the ABI type alignment
+   return DL.getABITypeAlignment(Ty);
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const { // Emits the NVPTX call node sequence; results go to InVals.
+   SelectionDAG &DAG = CLI.DAG;
+   SDLoc dl = CLI.DL;
+   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+   SDValue Chain = CLI.Chain;
+   SDValue Callee = CLI.Callee;
+   bool &isTailCall = CLI.IsTailCall;
+   ArgListTy &Args = CLI.getArgs();
+   Type *RetTy = CLI.RetTy;
+   ImmutableCallSite CS = CLI.CS;
+   const DataLayout &DL = DAG.getDataLayout();
+ 
+   bool isABI = (STI.getSmVersion() >= 20);
+   assert(isABI && "Non-ABI compilation is not supported");
+   if (!isABI)
+     return Chain; // Only reachable when the assert above is compiled out.
+ 
+   SDValue tempChain = Chain; // Chain before CALLSEQ_START; the byval loads below use it.
+   Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
+   SDValue InFlag = Chain.getValue(1);
+ 
+   unsigned paramCount = 0;
+   // Args.size() and Outs.size() need not match.
+   // Outs.size() will be larger
+   //   * if there is an aggregate argument with multiple fields (each field
+   //     showing up separately in Outs)
+   //   * if there is a vector argument with more than typical vector-length
+   //     elements (generally if more than 4) where each vector element is
+   //     individually present in Outs.
+   // So a different index should be used for indexing into Outs/OutVals.
+   // See similar issue in LowerFormalArguments.
+   unsigned OIdx = 0;
+   // Declare the .params or .reg need to pass values
+   // to the function
+   for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+     EVT VT = Outs[OIdx].VT;
+     Type *Ty = Args[i].Ty;
+ 
+     if (!Outs[OIdx].Flags.isByVal()) {
+       SmallVector<EVT, 16> VTs;
+       SmallVector<uint64_t, 16> Offsets;
+       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
+       unsigned ArgAlign =
+           getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+       unsigned AllocSize = DL.getTypeAllocSize(Ty);
+       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+       bool NeedAlign; // Does argument declaration specify alignment?
+       if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
+         // declare .param .align <align> .b8 .param<n>[<size>];
+         SDValue DeclareParamOps[] = {
+             Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+             DAG.getConstant(paramCount, dl, MVT::i32),
+             DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
+         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                             DeclareParamOps);
+         NeedAlign = true;
+       } else {
+         // declare .param .b<size> .param<n>;
+         if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
+           // PTX ABI requires integral types to be at least 32 bits in
+           // size. FP16 is loaded/stored using i16, so it's handled
+           // here as well.
+           AllocSize = 4;
+         }
+         SDValue DeclareScalarParamOps[] = {
+             Chain, DAG.getConstant(paramCount, dl, MVT::i32),
+             DAG.getConstant(AllocSize * 8, dl, MVT::i32),
+             DAG.getConstant(0, dl, MVT::i32), InFlag};
+         Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                             DeclareScalarParamOps);
+         NeedAlign = false;
+       }
+       InFlag = Chain.getValue(1);
+ 
+       // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+       // than 32-bits are sign extended or zero extended, depending on
+       // whether they are signed or unsigned types. This case applies
+       // only to scalar parameters and not to aggregate values.
+       bool ExtendIntegerParam =
+           Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
+ 
+       auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+       SmallVector<SDValue, 6> StoreOperands;
+       for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+         // New store.
+         if (VectorInfo[j] & PVF_FIRST) {
+           assert(StoreOperands.empty() && "Unfinished preceeding store.");
+           StoreOperands.push_back(Chain);
+           StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+           StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
+         }
+ 
+         EVT EltVT = VTs[j];
+         SDValue StVal = OutVals[OIdx];
+         if (ExtendIntegerParam) {
+           assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
+           // zext/sext to i32
+           StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                         : ISD::ZERO_EXTEND,
+                               dl, MVT::i32, StVal);
+         } else if (EltVT.getSizeInBits() < 16) {
+           // Use 16-bit registers for small stores as it's the
+           // smallest general purpose register size supported by NVPTX.
+           StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+         }
+ 
+         // Record the value to store.
+         StoreOperands.push_back(StVal);
+ 
+         if (VectorInfo[j] & PVF_LAST) {
+           unsigned NumElts = StoreOperands.size() - 3; // Minus chain, param index, offset.
+           NVPTXISD::NodeType Op;
+           switch (NumElts) {
+           case 1:
+             Op = NVPTXISD::StoreParam;
+             break;
+           case 2:
+             Op = NVPTXISD::StoreParamV2;
+             break;
+           case 4:
+             Op = NVPTXISD::StoreParamV4;
+             break;
+           default:
+             llvm_unreachable("Invalid vector info.");
+           }
+ 
+           StoreOperands.push_back(InFlag);
+ 
+           // Adjust type of the store op if we've extended the scalar
+           // return value.
+           EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
+           unsigned EltAlign =
+               NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
+ 
+           Chain = DAG.getMemIntrinsicNode(
+               Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+               TheStoreType, MachinePointerInfo(), EltAlign,
+               MachineMemOperand::MOStore);
+           InFlag = Chain.getValue(1);
+ 
+           // Cleanup.
+           StoreOperands.clear();
+         }
+         ++OIdx;
+       }
+       assert(StoreOperands.empty() && "Unfinished parameter store.");
+       if (VTs.size() > 0)
+         --OIdx; // The enclosing loop increments OIdx once more.
+       ++paramCount;
+       continue;
+     }
+ 
+     // ByVal arguments
+     SmallVector<EVT, 16> VTs;
+     SmallVector<uint64_t, 16> Offsets;
+     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+     assert(PTy && "Type of a byval parameter should be pointer");
+     ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
+ 
+     // declare .param .align <align> .b8 .param<n>[<size>];
+     unsigned sz = Outs[OIdx].Flags.getByValSize();
+     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+     unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+     // so we don't need to worry about natural alignment or not.
+     // See TargetLowering::LowerCallTo().
+ 
+     // Enforce minimum alignment of 4 to work around ptxas miscompile
+     // for sm_50+. See corresponding alignment adjustment in
+     // emitFunctionParamList() for details.
+     if (ArgAlign < 4)
+       ArgAlign = 4;
+     SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+                                  DAG.getConstant(paramCount, dl, MVT::i32),
+                                  DAG.getConstant(sz, dl, MVT::i32), InFlag};
+     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                         DeclareParamOps);
+     InFlag = Chain.getValue(1);
+     for (unsigned j = 0, je = VTs.size(); j != je; ++j) { // Copy the pointee piece by piece.
+       EVT elemtype = VTs[j];
+       int curOffset = Offsets[j];
+       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+       auto PtrVT = getPointerTy(DL);
+       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+                                     DAG.getConstant(curOffset, dl, PtrVT));
+       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                    MachinePointerInfo(), PartAlign);
+       if (elemtype.getSizeInBits() < 16) {
+         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+       }
+       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+       SDValue CopyParamOps[] = { Chain,
+                                  DAG.getConstant(paramCount, dl, MVT::i32),
+                                  DAG.getConstant(curOffset, dl, MVT::i32),
+                                  theVal, InFlag };
+       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                       CopyParamOps, elemtype,
+                                       MachinePointerInfo(), /* Align */ 0,
+                                       MachineMemOperand::MOStore);
+ 
+       InFlag = Chain.getValue(1);
+     }
+     ++paramCount;
+   }
+ 
+   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+   unsigned retAlignment = 0;
+ 
+   // Handle Result
+   if (Ins.size() > 0) {
+     SmallVector<EVT, 16> resvtparts;
+     ComputeValueVTs(*this, DL, RetTy, resvtparts);
+ 
+     // Declare
+     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+     //  .param .b<size-in-bits> retval0
+     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
+     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+     // these three types to match the logic in
+     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+     // Plus, this behavior is consistent with nvcc's.
+     if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
+         (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
+       // Scalar needs to be at least 32bit wide
+       if (resultsz < 32)
+         resultsz = 32;
+       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                                   DAG.getConstant(resultsz, dl, MVT::i32),
+                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
+       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                           DeclareRetOps);
+       InFlag = Chain.getValue(1);
+     } else {
+       retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+       SDValue DeclareRetOps[] = { Chain,
+                                   DAG.getConstant(retAlignment, dl, MVT::i32),
+                                   DAG.getConstant(resultsz / 8, dl, MVT::i32),
+                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
+       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                           DeclareRetOps);
+       InFlag = Chain.getValue(1);
+     }
+   }
+ 
+   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
+   // between them we must rely on the call site value which is valid for
+   // indirect calls but is always null for libcalls.
+   bool isIndirectCall = !Func && CS;
+ 
++  if (isa<ExternalSymbolSDNode>(Callee)) {
++    // Resolve the symbol to a function in the current module (fatal if absent).
++    Callee = DAG.getSymbolFunctionGlobalAddress(Callee);
++  }
++
+   if (isIndirectCall) {
+     // This is indirect function call case : PTX requires a prototype of the
+     // form
+     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+     // to be emitted, and the label has to used as the last arg of call
+     // instruction.
+     // The prototype is embedded in a string and put as the operand for a
+     // CallPrototype SDNode which will print out to the value of the string.
+     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+     std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
+     const char *ProtoStr =
+       nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+     SDValue ProtoOps[] = {
+       Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+     };
+     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+     InFlag = Chain.getValue(1);
+   }
+   // Op to just print "call"
+   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDValue PrintCallOps[] = {
+     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
+   };
+   // We model convergent calls as separate opcodes.
+   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
+   if (CLI.IsConvergent)
+     Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+                                               : NVPTXISD::PrintConvergentCall;
+   Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
+   InFlag = Chain.getValue(1);
+ 
+   // Ops to print out the function name
+   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+   Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+   InFlag = Chain.getValue(1);
+ 
+   // Ops to print out the param list
+   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDValue CallArgBeginOps[] = { Chain, InFlag };
+   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+                       CallArgBeginOps);
+   InFlag = Chain.getValue(1);
+ 
+   for (unsigned i = 0, e = paramCount; i != e; ++i) {
+     unsigned opcode;
+     if (i == (e - 1))
+       opcode = NVPTXISD::LastCallArg;
+     else
+       opcode = NVPTXISD::CallArg;
+     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                              DAG.getConstant(i, dl, MVT::i32), InFlag };
+     Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+     InFlag = Chain.getValue(1);
+   }
+   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+   SDValue CallArgEndOps[] = { Chain,
+                               DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
+                               InFlag };
+   Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+   InFlag = Chain.getValue(1);
+ 
+   if (isIndirectCall) {
+     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+     SDValue PrototypeOps[] = { Chain,
+                                DAG.getConstant(uniqueCallSite, dl, MVT::i32),
+                                InFlag };
+     Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+     InFlag = Chain.getValue(1);
+   }
+ 
++  SmallVector<SDValue, 16> ProxyRegOps; // Loaded return values, in order.
++  SmallVector<Optional<MVT>, 16> ProxyRegTruncates; // Per value: truncate-to type, if any.
++
+   // Generate loads from param memory/moves from registers for result
+   if (Ins.size() > 0) {
+     SmallVector<EVT, 16> VTs;
+     SmallVector<uint64_t, 16> Offsets;
+     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+     assert(VTs.size() == Ins.size() && "Bad value decomposition");
+ 
+     unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+ 
+     SmallVector<EVT, 6> LoadVTs;
+     int VecIdx = -1; // Index of the first element of the vector.
+ 
+     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+     // 32-bits are sign extended or zero extended, depending on whether
+     // they are signed or unsigned types.
+     bool ExtendIntegerRetVal =
+         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+ 
+     for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+       bool needTruncate = false;
+       EVT TheLoadType = VTs[i];
+       EVT EltType = Ins[i].VT;
+       unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+       if (ExtendIntegerRetVal) {
+         TheLoadType = MVT::i32;
+         EltType = MVT::i32;
+         needTruncate = true;
+       } else if (TheLoadType.getSizeInBits() < 16) {
+         if (VTs[i].isInteger())
+           needTruncate = true;
+         EltType = MVT::i16;
+       }
+ 
+       // Record index of the very first element of the vector.
+       if (VectorInfo[i] & PVF_FIRST) {
+         assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+         VecIdx = i;
+       }
+ 
+       LoadVTs.push_back(EltType);
+ 
+       if (VectorInfo[i] & PVF_LAST) {
+         unsigned NumElts = LoadVTs.size();
+         LoadVTs.push_back(MVT::Other);
+         LoadVTs.push_back(MVT::Glue);
+         NVPTXISD::NodeType Op;
+         switch (NumElts) {
+         case 1:
+           Op = NVPTXISD::LoadParam;
+           break;
+         case 2:
+           Op = NVPTXISD::LoadParamV2;
+           break;
+         case 4:
+           Op = NVPTXISD::LoadParamV4;
+           break;
+         default:
+           llvm_unreachable("Invalid vector info.");
+         }
+ 
+         SDValue LoadOperands[] = {
+             Chain, DAG.getConstant(1, dl, MVT::i32),
+             DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
+         SDValue RetVal = DAG.getMemIntrinsicNode(
+             Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
+             MachinePointerInfo(), EltAlign,
+             MachineMemOperand::MOLoad);
+ 
+         for (unsigned j = 0; j < NumElts; ++j) {
+-          SDValue Ret = RetVal.getValue(j);
++          ProxyRegOps.push_back(RetVal.getValue(j)); // Deferred until after callseq_end.
++
+           if (needTruncate)
+-            Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
+-          InVals.push_back(Ret);
++            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
++          else
++            ProxyRegTruncates.push_back(Optional<MVT>());
+         }
++
+         Chain = RetVal.getValue(NumElts);
+         InFlag = RetVal.getValue(NumElts + 1);
+ 
+         // Cleanup
+         VecIdx = -1;
+         LoadVTs.clear();
+       }
+     }
+   }
+ 
+   Chain = DAG.getCALLSEQ_END(Chain,
+                              DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+                              DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
+                                                    true),
+                              InFlag, dl);
++  InFlag = Chain.getValue(1); // Glue out of callseq_end, consumed by the first ProxyReg.
+   uniqueCallSite++;
+ 
++  // Pass every loaded return value through a ProxyReg node that is chained and
++  // glued after `callseq_end`, so that `callseq_end` will not get lost.
++  // Otherwise, during libcall expansion, the nodes can become dangling.
++  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
++    SDValue Ret = DAG.getNode(
++      NVPTXISD::ProxyReg, dl,
++      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
++      { Chain, ProxyRegOps[i], InFlag }
++    );
++
++    Chain = Ret.getValue(1);
++    InFlag = Ret.getValue(2);
++
++    if (ProxyRegTruncates[i].hasValue()) {
++      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
++    }
++
++    InVals.push_back(Ret);
++  }
++
+   // set isTailCall to false for now, until we figure out how to express
+   // tail call optimization in PTX
+   isTailCall = false;
+   return Chain;
+ }
+ 
+ // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+ // (see LegalizeDAG.cpp), which is slow and uses local memory.  Instead, extract
+ // every element of every operand and rebuild (as LegalizeOp() did in llvm 2.5).
+ SDValue
+ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+   SDLoc dl(Node);
+   SmallVector<SDValue, 8> Ops; // Scalar elements, in operand order.
+   unsigned NumOperands = Node->getNumOperands();
+   for (unsigned i = 0; i < NumOperands; ++i) {
+     SDValue SubOp = Node->getOperand(i);
+     EVT VVT = SubOp.getNode()->getValueType(0);
+     EVT EltVT = VVT.getVectorElementType();
+     unsigned NumSubElem = VVT.getVectorNumElements();
+     for (unsigned j = 0; j < NumSubElem; ++j) {
+       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+                                 DAG.getIntPtrConstant(j, dl)));
+     }
+   }
+   return DAG.getBuildVector(Node->getValueType(0), dl, Ops); // Result keeps the node's type.
+ }
+ 
+ // We can init constant f16x2 with a single .b32 move.  Normally it
+ // would get lowered as two constant loads and vector-packing move.
+ //        mov.b16         %h1, 0x4000;
+ //        mov.b16         %h2, 0x3C00;
+ //        mov.b32         %hh2, {%h2, %h1};
+ // Instead we want just a constant move:
+ //        mov.b32         %hh2, 0x40003C00
+ //
+ // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+ // generates good SASS in both cases.
+ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+   //return Op;  (NOTE(review): leftover commented-out early return -- consider removing)
+   if (!(Op->getValueType(0) == MVT::v2f16 &&
+         isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+         isa<ConstantFPSDNode>(Op->getOperand(1))))
+     return Op; // Only v2f16 with two constant operands is rewritten.
+ 
+   APInt E0 =
+       cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+   APInt E1 =
+       cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+   SDValue Const = // Element 1 goes above bit 16, element 0 in the low bits.
+       DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+   return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+   SDValue Index = Op->getOperand(1);
+   // Constant index will be matched by tablegen.
+   if (isa<ConstantSDNode>(Index.getNode()))
+     return Op;
+ 
+   // Extract individual elements and select one of them.
+   SDValue Vector = Op->getOperand(0);
+   EVT VectorVT = Vector.getValueType();
+   assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+   EVT EltVT = VectorVT.getVectorElementType();
+ 
+   SDLoc dl(Op.getNode());
+   SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                            DAG.getIntPtrConstant(0, dl));
+   SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                            DAG.getIntPtrConstant(1, dl));
+   return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+                          ISD::CondCode::SETEQ);
+ }
+ 
+ /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
+ /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+ ///    amount, or
+ /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+ ///    amount.
+ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+ 
+   EVT VT = Op.getValueType();
+   unsigned VTBits = VT.getSizeInBits();
+   SDLoc dl(Op);
+   SDValue ShOpLo = Op.getOperand(0);
+   SDValue ShOpHi = Op.getOperand(1);
+   SDValue ShAmt  = Op.getOperand(2);
+   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+ 
+   if (VTBits == 32 && STI.getSmVersion() >= 35) {
+     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+     // {dHi, dLo} = {aHi, aLo} >> Amt
+     //   dHi = aHi >> Amt
+     //   dLo = shf.r.clamp aLo, aHi, Amt
+ 
+     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+     SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                              ShAmt);
+ 
+     SDValue Ops[2] = { Lo, Hi };
+     return DAG.getMergeValues(Ops, dl);
+   }
+   else {
+     // {dHi, dLo} = {aHi, aLo} >> Amt
+     // - if (Amt>=size) then
+     //      dLo = aHi >> (Amt-size)
+     //      dHi = aHi >> Amt (this is either all 0 or all 1)
+     //   else
+     //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+     //      dHi = aHi >> Amt
+ 
+     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                    DAG.getConstant(VTBits, dl, MVT::i32),
+                                    ShAmt);
+     SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                      DAG.getConstant(VTBits, dl, MVT::i32));
+     SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+     SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+ 
+     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                                DAG.getConstant(VTBits, dl, MVT::i32),
+                                ISD::SETGE);
+     SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+     SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+ 
+     SDValue Ops[2] = { Lo, Hi };
+     return DAG.getMergeValues(Ops, dl);
+   }
+ }
+ 
+ /// LowerShiftLeftParts - Lower SHL_PARTS, which
+ /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+ ///    amount, or
+ /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+ ///    amount.
+ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+   assert(Op.getOpcode() == ISD::SHL_PARTS);
+ 
+   EVT VT = Op.getValueType();
+   unsigned VTBits = VT.getSizeInBits();
+   SDLoc dl(Op);
+   SDValue ShOpLo = Op.getOperand(0);
+   SDValue ShOpHi = Op.getOperand(1);
+   SDValue ShAmt  = Op.getOperand(2);
+ 
+   if (VTBits == 32 && STI.getSmVersion() >= 35) {
+     // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+     // {dHi, dLo} = {aHi, aLo} << Amt
+     //   dHi = shf.l.clamp aLo, aHi, Amt
+     //   dLo = aLo << Amt
+ 
+     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                              ShAmt);
+     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ 
+     SDValue Ops[2] = { Lo, Hi };
+     return DAG.getMergeValues(Ops, dl);
+   }
+   else {
+     // {dHi, dLo} = {aHi, aLo} << Amt
+     // - if (Amt>=size) then
+     //      dLo = aLo << Amt (all 0)
+     //      dLo = aLo << (Amt-size)
+     //   else
+     //      dLo = aLo << Amt
+     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
+ 
+     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                    DAG.getConstant(VTBits, dl, MVT::i32),
+                                    ShAmt);
+     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                      DAG.getConstant(VTBits, dl, MVT::i32));
+     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+ 
+     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                                DAG.getConstant(VTBits, dl, MVT::i32),
+                                ISD::SETGE);
+     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+ 
+     SDValue Ops[2] = { Lo, Hi };
+     return DAG.getMergeValues(Ops, dl);
+   }
+ }
+ 
+ SDValue
+ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+   switch (Op.getOpcode()) {
+   case ISD::RETURNADDR:
+     return SDValue();
+   case ISD::FRAMEADDR:
+     return SDValue();
+   case ISD::GlobalAddress:
+     return LowerGlobalAddress(Op, DAG);
+   case ISD::INTRINSIC_W_CHAIN:
+     return Op;
+   case ISD::BUILD_VECTOR:
+     return LowerBUILD_VECTOR(Op, DAG);
+   case ISD::EXTRACT_SUBVECTOR:
+     return Op;
+   case ISD::EXTRACT_VECTOR_ELT:
+     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+   case ISD::CONCAT_VECTORS:
+     return LowerCONCAT_VECTORS(Op, DAG);
+   case ISD::STORE:
+     return LowerSTORE(Op, DAG);
+   case ISD::LOAD:
+     return LowerLOAD(Op, DAG);
+   case ISD::SHL_PARTS:
+     return LowerShiftLeftParts(Op, DAG);
+   case ISD::SRA_PARTS:
+   case ISD::SRL_PARTS:
+     return LowerShiftRightParts(Op, DAG);
+   case ISD::SELECT:
+     return LowerSelect(Op, DAG);
+   default:
+     llvm_unreachable("Custom lowering not defined for operation");
+   }
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+   SDValue Op0 = Op->getOperand(0);
+   SDValue Op1 = Op->getOperand(1);
+   SDValue Op2 = Op->getOperand(2);
+   SDLoc DL(Op.getNode());
+ 
+   assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+ 
+   Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+   Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+   SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
+   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+ 
+   return Trunc;
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+   if (Op.getValueType() == MVT::i1)
+     return LowerLOADi1(Op, DAG);
+ 
+   // v2f16 is legal, so we can't rely on legalizer to handle unaligned
+   // loads and have to handle it here.
+   if (Op.getValueType() == MVT::v2f16) {
+     LoadSDNode *Load = cast<LoadSDNode>(Op);
+     EVT MemVT = Load->getMemoryVT();
+     if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                             Load->getAddressSpace(), Load->getAlignment())) {
+       SDValue Ops[2];
+       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+       return DAG.getMergeValues(Ops, SDLoc(Op));
+     }
+   }
+ 
+   return SDValue();
+ }
+ 
+ // v = ld i1* addr
+ //   =>
+ // v1 = ld i8* addr (-> i16)
+ // v = trunc i16 to i1
+ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+   LoadSDNode *LD = cast<LoadSDNode>(Node);
+   SDLoc dl(Node);
+   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+   assert(Node->getValueType(0) == MVT::i1 &&
+          "Custom lowering for i1 load only");
+   SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+                               LD->getPointerInfo(), LD->getAlignment(),
+                               LD->getMemOperand()->getFlags());
+   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+   // The legalizer (the caller) is expecting two values from the legalized
+   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+   // in LegalizeDAG.cpp which also uses MergeValues.
+   SDValue Ops[] = { result, LD->getChain() };
+   return DAG.getMergeValues(Ops, dl);
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+   StoreSDNode *Store = cast<StoreSDNode>(Op);
+   EVT VT = Store->getMemoryVT();
+ 
+   if (VT == MVT::i1)
+     return LowerSTOREi1(Op, DAG);
+ 
+   // v2f16 is legal, so we can't rely on legalizer to handle unaligned
+   // stores and have to handle it here.
+   if (VT == MVT::v2f16 &&
+       !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                           Store->getAddressSpace(), Store->getAlignment()))
+     return expandUnalignedStore(Store, DAG);
+ 
+   if (VT.isVector())
+     return LowerSTOREVector(Op, DAG);
+ 
+   return SDValue();
+ }
+ 
+ SDValue
+ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *N = Op.getNode();
+   SDValue Val = N->getOperand(1);
+   SDLoc DL(N);
+   EVT ValVT = Val.getValueType();
+ 
+   if (ValVT.isVector()) {
+     // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+     // legal.  We can (and should) split that into 2 stores of <2 x double> here
+     // but I'm leaving that as a TODO for now.
+     if (!ValVT.isSimple())
+       return SDValue();
+     switch (ValVT.getSimpleVT().SimpleTy) {
+     default:
+       return SDValue();
+     case MVT::v2i8:
+     case MVT::v2i16:
+     case MVT::v2i32:
+     case MVT::v2i64:
+     case MVT::v2f16:
+     case MVT::v2f32:
+     case MVT::v2f64:
+     case MVT::v4i8:
+     case MVT::v4i16:
+     case MVT::v4i32:
+     case MVT::v4f16:
+     case MVT::v4f32:
+     case MVT::v8f16: // <4 x f16x2>
+       // This is a "native" vector type
+       break;
+     }
+ 
+     MemSDNode *MemSD = cast<MemSDNode>(N);
+     const DataLayout &TD = DAG.getDataLayout();
+ 
+     unsigned Align = MemSD->getAlignment();
+     unsigned PrefAlign =
+         TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+     if (Align < PrefAlign) {
+       // This store is not sufficiently aligned, so bail out and let this vector
+       // store be scalarized.  Note that we may still be able to emit smaller
+       // vector stores.  For example, if we are storing a <4 x float> with an
+       // alignment of 8, this check will fail but the legalizer will try again
+       // with 2 x <2 x float>, which will succeed with an alignment of 8.
+       return SDValue();
+     }
+ 
+     unsigned Opcode = 0;
+     EVT EltVT = ValVT.getVectorElementType();
+     unsigned NumElts = ValVT.getVectorNumElements();
+ 
+     // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+     // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+     // stored type to i16 and propagate the "real" type as the memory type.
+     bool NeedExt = false;
+     if (EltVT.getSizeInBits() < 16)
+       NeedExt = true;
+ 
+     bool StoreF16x2 = false;
+     switch (NumElts) {
+     default:
+       return SDValue();
+     case 2:
+       Opcode = NVPTXISD::StoreV2;
+       break;
+     case 4:
+       Opcode = NVPTXISD::StoreV4;
+       break;
+     case 8:
+       // v8f16 is a special case. PTX doesn't have st.v8.f16
+       // instruction. Instead, we split the vector into v2f16 chunks and
+       // store them with st.v4.b32.
+       assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+       Opcode = NVPTXISD::StoreV4;
+       StoreF16x2 = true;
+       break;
+     }
+ 
+     SmallVector<SDValue, 8> Ops;
+ 
+     // First is the chain
+     Ops.push_back(N->getOperand(0));
+ 
+     if (StoreF16x2) {
+       // Combine f16,f16 -> v2f16
+       NumElts /= 2;
+       for (unsigned i = 0; i < NumElts; ++i) {
+         SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+                                  DAG.getIntPtrConstant(i * 2, DL));
+         SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+                                  DAG.getIntPtrConstant(i * 2 + 1, DL));
+         SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
+         Ops.push_back(V2);
+       }
+     } else {
+       // Then the split values
+       for (unsigned i = 0; i < NumElts; ++i) {
+         SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                      DAG.getIntPtrConstant(i, DL));
+         if (NeedExt)
+           ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+         Ops.push_back(ExtVal);
+       }
+     }
+ 
+     // Then any remaining arguments
+     Ops.append(N->op_begin() + 2, N->op_end());
+ 
+     SDValue NewSt =
+         DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+                                 MemSD->getMemoryVT(), MemSD->getMemOperand());
+ 
+     // return DCI.CombineTo(N, NewSt, true);
+     return NewSt;
+   }
+ 
+   return SDValue();
+ }
+ 
+ // st i1 v, addr
+ //    =>
+ // v1 = zxt v to i16
+ // st.u8 i16, addr
+ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+   SDNode *Node = Op.getNode();
+   SDLoc dl(Node);
+   StoreSDNode *ST = cast<StoreSDNode>(Node);
+   SDValue Tmp1 = ST->getChain();
+   SDValue Tmp2 = ST->getBasePtr();
+   SDValue Tmp3 = ST->getValue();
+   assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+   Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+   SDValue Result =
+       DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
+                         ST->getAlignment(), ST->getMemOperand()->getFlags());
+   return Result;
+ }
+ 
+ SDValue
+ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+   std::string ParamSym;
+   raw_string_ostream ParamStr(ParamSym);
+ 
+   ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+   ParamStr.flush();
+ 
+   std::string *SavedStr =
+     nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+   return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
+ }
+ 
+ // Check to see if the kernel argument is image*_t or sampler_t
+ 
+ static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
+   static const char *const specialTypes[] = { "struct._image2d_t",
+                                               "struct._image3d_t",
+                                               "struct._sampler_t" };
+ 
+   Type *Ty = arg->getType();
+   auto *PTy = dyn_cast<PointerType>(Ty);
+ 
+   if (!PTy)
+     return false;
+ 
+   if (!context)
+     return false;
+ 
+   auto *STy = dyn_cast<StructType>(PTy->getElementType());
+   if (!STy || STy->isLiteral())
+     return false;
+ 
+   return std::find(std::begin(specialTypes), std::end(specialTypes),
+                    STy->getName()) != std::end(specialTypes);
+ }
+ 
+ SDValue NVPTXTargetLowering::LowerFormalArguments(
+     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   const DataLayout &DL = DAG.getDataLayout();
+   auto PtrVT = getPointerTy(DAG.getDataLayout());
+ 
+   const Function *F = &MF.getFunction();
+   const AttributeList &PAL = F->getAttributes();
+   const TargetLowering *TLI = STI.getTargetLowering();
+ 
+   SDValue Root = DAG.getRoot();
+   std::vector<SDValue> OutChains;
+ 
+   bool isABI = (STI.getSmVersion() >= 20);
+   assert(isABI && "Non-ABI compilation is not supported");
+   if (!isABI)
+     return Chain;
+ 
+   std::vector<Type *> argTypes;
+   std::vector<const Argument *> theArgs;
+   for (const Argument &I : F->args()) {
+     theArgs.push_back(&I);
+     argTypes.push_back(I.getType());
+   }
+   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+   // Ins.size() will be larger
+   //   * if there is an aggregate argument with multiple fields (each field
+   //     showing up separately in Ins)
+   //   * if there is a vector argument with more than typical vector-length
+   //     elements (generally if more than 4) where each vector element is
+   //     individually present in Ins.
+   // So a different index should be used for indexing into Ins.
+   // See similar issue in LowerCall.
+   unsigned InsIdx = 0;
+ 
+   int idx = 0;
+   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+     Type *Ty = argTypes[i];
+ 
+     // If the kernel argument is image*_t or sampler_t, convert it to
+     // a i32 constant holding the parameter position. This can later
+     // matched in the AsmPrinter to output the correct mangled name.
+     if (isImageOrSamplerVal(
+             theArgs[i],
+             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+                                      : nullptr))) {
+       assert(isKernelFunction(*F) &&
+              "Only kernels can have image/sampler params");
+       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
+       continue;
+     }
+ 
+     if (theArgs[i]->use_empty()) {
+       // argument is dead
+       if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
+         SmallVector<EVT, 16> vtparts;
+ 
+         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
+         assert(vtparts.size() > 0 && "empty aggregate type not expected");
+         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+              ++parti) {
+           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+           ++InsIdx;
+         }
+         if (vtparts.size() > 0)
+           --InsIdx;
+         continue;
+       }
+       if (Ty->isVectorTy()) {
+         EVT ObjectVT = getValueType(DL, Ty);
+         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+         for (unsigned parti = 0; parti < NumRegs; ++parti) {
+           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+           ++InsIdx;
+         }
+         if (NumRegs > 0)
+           --InsIdx;
+         continue;
+       }
+       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+       continue;
+     }
+ 
+     // In the following cases, assign a node order of "idx+1"
+     // to newly created nodes. The SDNodes for params have to
+     // appear in the same order as their order of appearance
+     // in the original function. "idx+1" holds that order.
+     if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
+       bool aggregateIsPacked = false;
+       if (StructType *STy = dyn_cast<StructType>(Ty))
+         aggregateIsPacked = STy->isPacked();
+ 
+       SmallVector<EVT, 16> VTs;
+       SmallVector<uint64_t, 16> Offsets;
+       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
+       assert(VTs.size() > 0 && "Unexpected empty type.");
+       auto VectorInfo =
+           VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
+ 
+       SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+       int VecIdx = -1; // Index of the first element of the current vector.
+       for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
+         if (VectorInfo[parti] & PVF_FIRST) {
+           assert(VecIdx == -1 && "Orphaned vector.");
+           VecIdx = parti;
+         }
+ 
+         // That's the last element of this store op.
+         if (VectorInfo[parti] & PVF_LAST) {
+           unsigned NumElts = parti - VecIdx + 1;
+           EVT EltVT = VTs[parti];
+           // i1 is loaded/stored as i8.
+           EVT LoadVT = EltVT;
+           if (EltVT == MVT::i1)
+             LoadVT = MVT::i8;
+           else if (EltVT == MVT::v2f16)
+             // getLoad needs a vector type, but it can't handle
+             // vectors which contain v2f16 elements. So we must load
+             // using i32 here and then bitcast back.
+             LoadVT = MVT::i32;
+ 
+           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
+           SDValue VecAddr =
+               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
+           Value *srcValue = Constant::getNullValue(PointerType::get(
+               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+           SDValue P =
+               DAG.getLoad(VecVT, dl, Root, VecAddr,
+                           MachinePointerInfo(srcValue), aggregateIsPacked,
+                           MachineMemOperand::MODereferenceable |
+                               MachineMemOperand::MOInvariant);
+           if (P.getNode())
+             P.getNode()->setIROrder(idx + 1);
+           for (unsigned j = 0; j < NumElts; ++j) {
+             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
+                                       DAG.getIntPtrConstant(j, dl));
+             // We've loaded i1 as an i8 and now must truncate it back to i1
+             if (EltVT == MVT::i1)
+               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
+             // v2f16 was loaded as an i32. Now we must bitcast it back.
+             else if (EltVT == MVT::v2f16)
+               Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+             // Extend the element if necessary (e.g. an i8 is loaded
+             // into an i16 register)
+             if (Ins[InsIdx].VT.isInteger() &&
+                 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                            : ISD::ZERO_EXTEND;
+               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
+             }
+             InVals.push_back(Elt);
+           }
+ 
+           // Reset vector tracking state.
+           VecIdx = -1;
+         }
+         ++InsIdx;
+       }
+       if (VTs.size() > 0)
+         --InsIdx;
+       continue;
+     }
+ 
+     // Param has ByVal attribute
+     // Return MoveParam(param symbol).
+     // Ideally, the param symbol can be returned directly,
+     // but when SDNode builder decides to use it in a CopyToReg(),
+     // machine instruction fails because TargetExternalSymbol
+     // (not lowered) is target dependent, and CopyToReg assumes
+     // the source is lowered.
+     EVT ObjectVT = getValueType(DL, Ty);
+     assert(ObjectVT == Ins[InsIdx].VT &&
+            "Ins type did not match function type");
+     SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+     if (p.getNode())
+       p.getNode()->setIROrder(idx + 1);
+     InVals.push_back(p);
+   }
+ 
+   // Clang will check explicit VarArg and issue error if any. However, Clang
+   // will let code with
+   // implicit var arg like f() pass. See bug 617733.
+   // We treat this case as if the arg list is empty.
+   // if (F.isVarArg()) {
+   // assert(0 && "VarArg not supported yet!");
+   //}
+ 
+   if (!OutChains.empty())
+     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+ 
+   return Chain;
+ }
+ 
+ SDValue
+ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                  bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  const SDLoc &dl, SelectionDAG &DAG) const {
+   MachineFunction &MF = DAG.getMachineFunction();
+   Type *RetTy = MF.getFunction().getReturnType();
+ 
+   bool isABI = (STI.getSmVersion() >= 20);
+   assert(isABI && "Non-ABI compilation is not supported");
+   if (!isABI)
+     return Chain;
+ 
+   const DataLayout DL = DAG.getDataLayout();
+   SmallVector<EVT, 16> VTs;
+   SmallVector<uint64_t, 16> Offsets;
+   ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+   assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+ 
+   auto VectorInfo = VectorizePTXValueVTs(
+       VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
+ 
+   // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+   // 32-bits are sign extended or zero extended, depending on whether
+   // they are signed or unsigned types.
+   bool ExtendIntegerRetVal =
+       RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+ 
+   SmallVector<SDValue, 6> StoreOperands;
+   for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+     // New load/store. Record chain and offset operands.
+     if (VectorInfo[i] & PVF_FIRST) {
+       assert(StoreOperands.empty() && "Orphaned operand list.");
+       StoreOperands.push_back(Chain);
+       StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+     }
+ 
+     SDValue RetVal = OutVals[i];
+     if (ExtendIntegerRetVal) {
+       RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                   : ISD::ZERO_EXTEND,
+                            dl, MVT::i32, RetVal);
+     } else if (RetVal.getValueSizeInBits() < 16) {
+       // Use 16-bit registers for small load-stores as it's the
+       // smallest general purpose register size supported by NVPTX.
+       RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
+     }
+ 
+     // Record the value to return.
+     StoreOperands.push_back(RetVal);
+ 
+     // That's the last element of this store op.
+     if (VectorInfo[i] & PVF_LAST) {
+       NVPTXISD::NodeType Op;
+       unsigned NumElts = StoreOperands.size() - 2;
+       switch (NumElts) {
+       case 1:
+         Op = NVPTXISD::StoreRetval;
+         break;
+       case 2:
+         Op = NVPTXISD::StoreRetvalV2;
+         break;
+       case 4:
+         Op = NVPTXISD::StoreRetvalV4;
+         break;
+       default:
+         llvm_unreachable("Invalid vector info.");
+       }
+ 
+       // Adjust type of load/store op if we've extended the scalar
+       // return value.
+       EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+       Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
+                                       StoreOperands, TheStoreType,
+                                       MachinePointerInfo(), /* Align */ 1,
+                                       MachineMemOperand::MOStore);
+       // Cleanup vector state.
+       StoreOperands.clear();
+     }
+   }
+ 
+   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+ }
+ 
+ void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+     SelectionDAG &DAG) const {
+   if (Constraint.length() > 1)
+     return;
+   else
+     TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+ }
+ 
+ static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+   switch (Intrinsic) {
+   default:
+     return 0;
+ 
+   case Intrinsic::nvvm_tex_1d_v4f32_s32:
+     return NVPTXISD::Tex1DFloatS32;
+   case Intrinsic::nvvm_tex_1d_v4f32_f32:
+     return NVPTXISD::Tex1DFloatFloat;
+   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+     return NVPTXISD::Tex1DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+     return NVPTXISD::Tex1DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_1d_v4s32_s32:
+     return NVPTXISD::Tex1DS32S32;
+   case Intrinsic::nvvm_tex_1d_v4s32_f32:
+     return NVPTXISD::Tex1DS32Float;
+   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+     return NVPTXISD::Tex1DS32FloatLevel;
+   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+     return NVPTXISD::Tex1DS32FloatGrad;
+   case Intrinsic::nvvm_tex_1d_v4u32_s32:
+     return NVPTXISD::Tex1DU32S32;
+   case Intrinsic::nvvm_tex_1d_v4u32_f32:
+     return NVPTXISD::Tex1DU32Float;
+   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+     return NVPTXISD::Tex1DU32FloatLevel;
+   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+     return NVPTXISD::Tex1DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+     return NVPTXISD::Tex1DArrayFloatS32;
+   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+     return NVPTXISD::Tex1DArrayFloatFloat;
+   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+     return NVPTXISD::Tex1DArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+     return NVPTXISD::Tex1DArrayFloatFloatGrad;
+   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+     return NVPTXISD::Tex1DArrayS32S32;
+   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+     return NVPTXISD::Tex1DArrayS32Float;
+   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+     return NVPTXISD::Tex1DArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+     return NVPTXISD::Tex1DArrayS32FloatGrad;
+   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+     return NVPTXISD::Tex1DArrayU32S32;
+   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+     return NVPTXISD::Tex1DArrayU32Float;
+   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+     return NVPTXISD::Tex1DArrayU32FloatLevel;
+   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+     return NVPTXISD::Tex1DArrayU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_2d_v4f32_s32:
+     return NVPTXISD::Tex2DFloatS32;
+   case Intrinsic::nvvm_tex_2d_v4f32_f32:
+     return NVPTXISD::Tex2DFloatFloat;
+   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+     return NVPTXISD::Tex2DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+     return NVPTXISD::Tex2DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_2d_v4s32_s32:
+     return NVPTXISD::Tex2DS32S32;
+   case Intrinsic::nvvm_tex_2d_v4s32_f32:
+     return NVPTXISD::Tex2DS32Float;
+   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+     return NVPTXISD::Tex2DS32FloatLevel;
+   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+     return NVPTXISD::Tex2DS32FloatGrad;
+   case Intrinsic::nvvm_tex_2d_v4u32_s32:
+     return NVPTXISD::Tex2DU32S32;
+   case Intrinsic::nvvm_tex_2d_v4u32_f32:
+     return NVPTXISD::Tex2DU32Float;
+   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+     return NVPTXISD::Tex2DU32FloatLevel;
+   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+     return NVPTXISD::Tex2DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+     return NVPTXISD::Tex2DArrayFloatS32;
+   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+     return NVPTXISD::Tex2DArrayFloatFloat;
+   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+     return NVPTXISD::Tex2DArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+     return NVPTXISD::Tex2DArrayFloatFloatGrad;
+   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+     return NVPTXISD::Tex2DArrayS32S32;
+   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+     return NVPTXISD::Tex2DArrayS32Float;
+   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+     return NVPTXISD::Tex2DArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+     return NVPTXISD::Tex2DArrayS32FloatGrad;
+   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+     return NVPTXISD::Tex2DArrayU32S32;
+   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+     return NVPTXISD::Tex2DArrayU32Float;
+   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+     return NVPTXISD::Tex2DArrayU32FloatLevel;
+   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+     return NVPTXISD::Tex2DArrayU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_3d_v4f32_s32:
+     return NVPTXISD::Tex3DFloatS32;
+   case Intrinsic::nvvm_tex_3d_v4f32_f32:
+     return NVPTXISD::Tex3DFloatFloat;
+   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+     return NVPTXISD::Tex3DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+     return NVPTXISD::Tex3DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_3d_v4s32_s32:
+     return NVPTXISD::Tex3DS32S32;
+   case Intrinsic::nvvm_tex_3d_v4s32_f32:
+     return NVPTXISD::Tex3DS32Float;
+   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+     return NVPTXISD::Tex3DS32FloatLevel;
+   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+     return NVPTXISD::Tex3DS32FloatGrad;
+   case Intrinsic::nvvm_tex_3d_v4u32_s32:
+     return NVPTXISD::Tex3DU32S32;
+   case Intrinsic::nvvm_tex_3d_v4u32_f32:
+     return NVPTXISD::Tex3DU32Float;
+   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+     return NVPTXISD::Tex3DU32FloatLevel;
+   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+     return NVPTXISD::Tex3DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_cube_v4f32_f32:
+     return NVPTXISD::TexCubeFloatFloat;
+   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+     return NVPTXISD::TexCubeFloatFloatLevel;
+   case Intrinsic::nvvm_tex_cube_v4s32_f32:
+     return NVPTXISD::TexCubeS32Float;
+   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+     return NVPTXISD::TexCubeS32FloatLevel;
+   case Intrinsic::nvvm_tex_cube_v4u32_f32:
+     return NVPTXISD::TexCubeU32Float;
+   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+     return NVPTXISD::TexCubeU32FloatLevel;
+ 
+   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+     return NVPTXISD::TexCubeArrayFloatFloat;
+   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+     return NVPTXISD::TexCubeArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+     return NVPTXISD::TexCubeArrayS32Float;
+   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+     return NVPTXISD::TexCubeArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+     return NVPTXISD::TexCubeArrayU32Float;
+   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+     return NVPTXISD::TexCubeArrayU32FloatLevel;
+ 
+   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+     return NVPTXISD::Tld4R2DFloatFloat;
+   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+     return NVPTXISD::Tld4G2DFloatFloat;
+   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+     return NVPTXISD::Tld4B2DFloatFloat;
+   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+     return NVPTXISD::Tld4A2DFloatFloat;
+   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+     return NVPTXISD::Tld4R2DS64Float;
+   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+     return NVPTXISD::Tld4G2DS64Float;
+   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+     return NVPTXISD::Tld4B2DS64Float;
+   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+     return NVPTXISD::Tld4A2DS64Float;
+   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+     return NVPTXISD::Tld4R2DU64Float;
+   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+     return NVPTXISD::Tld4G2DU64Float;
+   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+     return NVPTXISD::Tld4B2DU64Float;
+   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+     return NVPTXISD::Tld4A2DU64Float;
+ 
+   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+     return NVPTXISD::TexUnified1DFloatS32;
+   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+     return NVPTXISD::TexUnified1DFloatFloat;
+   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+     return NVPTXISD::TexUnified1DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+     return NVPTXISD::TexUnified1DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+     return NVPTXISD::TexUnified1DS32S32;
+   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+     return NVPTXISD::TexUnified1DS32Float;
+   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+     return NVPTXISD::TexUnified1DS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+     return NVPTXISD::TexUnified1DS32FloatGrad;
+   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+     return NVPTXISD::TexUnified1DU32S32;
+   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+     return NVPTXISD::TexUnified1DU32Float;
+   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+     return NVPTXISD::TexUnified1DU32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+     return NVPTXISD::TexUnified1DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+     return NVPTXISD::TexUnified1DArrayFloatS32;
+   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+     return NVPTXISD::TexUnified1DArrayFloatFloat;
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+     return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+     return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+     return NVPTXISD::TexUnified1DArrayS32S32;
+   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+     return NVPTXISD::TexUnified1DArrayS32Float;
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+     return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+     return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+     return NVPTXISD::TexUnified1DArrayU32S32;
+   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+     return NVPTXISD::TexUnified1DArrayU32Float;
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+     return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+     return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+     return NVPTXISD::TexUnified2DFloatS32;
+   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+     return NVPTXISD::TexUnified2DFloatFloat;
+   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+     return NVPTXISD::TexUnified2DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+     return NVPTXISD::TexUnified2DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+     return NVPTXISD::TexUnified2DS32S32;
+   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+     return NVPTXISD::TexUnified2DS32Float;
+   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+     return NVPTXISD::TexUnified2DS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+     return NVPTXISD::TexUnified2DS32FloatGrad;
+   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+     return NVPTXISD::TexUnified2DU32S32;
+   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+     return NVPTXISD::TexUnified2DU32Float;
+   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+     return NVPTXISD::TexUnified2DU32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+     return NVPTXISD::TexUnified2DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+     return NVPTXISD::TexUnified2DArrayFloatS32;
+   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+     return NVPTXISD::TexUnified2DArrayFloatFloat;
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+     return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+     return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+     return NVPTXISD::TexUnified2DArrayS32S32;
+   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+     return NVPTXISD::TexUnified2DArrayS32Float;
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+     return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+     return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+     return NVPTXISD::TexUnified2DArrayU32S32;
+   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+     return NVPTXISD::TexUnified2DArrayU32Float;
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+     return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+     return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+     return NVPTXISD::TexUnified3DFloatS32;
+   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+     return NVPTXISD::TexUnified3DFloatFloat;
+   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+     return NVPTXISD::TexUnified3DFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+     return NVPTXISD::TexUnified3DFloatFloatGrad;
+   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+     return NVPTXISD::TexUnified3DS32S32;
+   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+     return NVPTXISD::TexUnified3DS32Float;
+   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+     return NVPTXISD::TexUnified3DS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+     return NVPTXISD::TexUnified3DS32FloatGrad;
+   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+     return NVPTXISD::TexUnified3DU32S32;
+   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+     return NVPTXISD::TexUnified3DU32Float;
+   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+     return NVPTXISD::TexUnified3DU32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+     return NVPTXISD::TexUnified3DU32FloatGrad;
+ 
+   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+     return NVPTXISD::TexUnifiedCubeFloatFloat;
+   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+     return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+     return NVPTXISD::TexUnifiedCubeS32Float;
+   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+     return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+     return NVPTXISD::TexUnifiedCubeU32Float;
+   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+     return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+ 
+   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayS32Float;
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayU32Float;
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+     return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+ 
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+     return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+     return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+     return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+     return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+     return NVPTXISD::Tld4UnifiedR2DS64Float;
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+     return NVPTXISD::Tld4UnifiedG2DS64Float;
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+     return NVPTXISD::Tld4UnifiedB2DS64Float;
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+     return NVPTXISD::Tld4UnifiedA2DS64Float;
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+     return NVPTXISD::Tld4UnifiedR2DU64Float;
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+     return NVPTXISD::Tld4UnifiedG2DU64Float;
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+     return NVPTXISD::Tld4UnifiedB2DU64Float;
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+     return NVPTXISD::Tld4UnifiedA2DU64Float;
+   }
+ }
+ 
+ static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+   switch (Intrinsic) {
+   default:
+     return 0;
+   case Intrinsic::nvvm_suld_1d_i8_clamp:
+     return NVPTXISD::Suld1DI8Clamp;
+   case Intrinsic::nvvm_suld_1d_i16_clamp:
+     return NVPTXISD::Suld1DI16Clamp;
+   case Intrinsic::nvvm_suld_1d_i32_clamp:
+     return NVPTXISD::Suld1DI32Clamp;
+   case Intrinsic::nvvm_suld_1d_i64_clamp:
+     return NVPTXISD::Suld1DI64Clamp;
+   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+     return NVPTXISD::Suld1DV2I8Clamp;
+   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+     return NVPTXISD::Suld1DV2I16Clamp;
+   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+     return NVPTXISD::Suld1DV2I32Clamp;
+   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+     return NVPTXISD::Suld1DV2I64Clamp;
+   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+     return NVPTXISD::Suld1DV4I8Clamp;
+   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+     return NVPTXISD::Suld1DV4I16Clamp;
+   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+     return NVPTXISD::Suld1DV4I32Clamp;
+   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+     return NVPTXISD::Suld1DArrayI8Clamp;
+   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+     return NVPTXISD::Suld1DArrayI16Clamp;
+   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+     return NVPTXISD::Suld1DArrayI32Clamp;
+   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+     return NVPTXISD::Suld1DArrayI64Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+     return NVPTXISD::Suld1DArrayV2I8Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+     return NVPTXISD::Suld1DArrayV2I16Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+     return NVPTXISD::Suld1DArrayV2I32Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+     return NVPTXISD::Suld1DArrayV2I64Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+     return NVPTXISD::Suld1DArrayV4I8Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+     return NVPTXISD::Suld1DArrayV4I16Clamp;
+   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+     return NVPTXISD::Suld1DArrayV4I32Clamp;
+   case Intrinsic::nvvm_suld_2d_i8_clamp:
+     return NVPTXISD::Suld2DI8Clamp;
+   case Intrinsic::nvvm_suld_2d_i16_clamp:
+     return NVPTXISD::Suld2DI16Clamp;
+   case Intrinsic::nvvm_suld_2d_i32_clamp:
+     return NVPTXISD::Suld2DI32Clamp;
+   case Intrinsic::nvvm_suld_2d_i64_clamp:
+     return NVPTXISD::Suld2DI64Clamp;
+   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+     return NVPTXISD::Suld2DV2I8Clamp;
+   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+     return NVPTXISD::Suld2DV2I16Clamp;
+   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+     return NVPTXISD::Suld2DV2I32Clamp;
+   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+     return NVPTXISD::Suld2DV2I64Clamp;
+   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+     return NVPTXISD::Suld2DV4I8Clamp;
+   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+     return NVPTXISD::Suld2DV4I16Clamp;
+   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+     return NVPTXISD::Suld2DV4I32Clamp;
+   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+     return NVPTXISD::Suld2DArrayI8Clamp;
+   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+     return NVPTXISD::Suld2DArrayI16Clamp;
+   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+     return NVPTXISD::Suld2DArrayI32Clamp;
+   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+     return NVPTXISD::Suld2DArrayI64Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+     return NVPTXISD::Suld2DArrayV2I8Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+     return NVPTXISD::Suld2DArrayV2I16Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+     return NVPTXISD::Suld2DArrayV2I32Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+     return NVPTXISD::Suld2DArrayV2I64Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+     return NVPTXISD::Suld2DArrayV4I8Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+     return NVPTXISD::Suld2DArrayV4I16Clamp;
+   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+     return NVPTXISD::Suld2DArrayV4I32Clamp;
+   case Intrinsic::nvvm_suld_3d_i8_clamp:
+     return NVPTXISD::Suld3DI8Clamp;
+   case Intrinsic::nvvm_suld_3d_i16_clamp:
+     return NVPTXISD::Suld3DI16Clamp;
+   case Intrinsic::nvvm_suld_3d_i32_clamp:
+     return NVPTXISD::Suld3DI32Clamp;
+   case Intrinsic::nvvm_suld_3d_i64_clamp:
+     return NVPTXISD::Suld3DI64Clamp;
+   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+     return NVPTXISD::Suld3DV2I8Clamp;
+   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+     return NVPTXISD::Suld3DV2I16Clamp;
+   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+     return NVPTXISD::Suld3DV2I32Clamp;
+   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+     return NVPTXISD::Suld3DV2I64Clamp;
+   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+     return NVPTXISD::Suld3DV4I8Clamp;
+   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+     return NVPTXISD::Suld3DV4I16Clamp;
+   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+     return NVPTXISD::Suld3DV4I32Clamp;
+   case Intrinsic::nvvm_suld_1d_i8_trap:
+     return NVPTXISD::Suld1DI8Trap;
+   case Intrinsic::nvvm_suld_1d_i16_trap:
+     return NVPTXISD::Suld1DI16Trap;
+   case Intrinsic::nvvm_suld_1d_i32_trap:
+     return NVPTXISD::Suld1DI32Trap;
+   case Intrinsic::nvvm_suld_1d_i64_trap:
+     return NVPTXISD::Suld1DI64Trap;
+   case Intrinsic::nvvm_suld_1d_v2i8_trap:
+     return NVPTXISD::Suld1DV2I8Trap;
+   case Intrinsic::nvvm_suld_1d_v2i16_trap:
+     return NVPTXISD::Suld1DV2I16Trap;
+   case Intrinsic::nvvm_suld_1d_v2i32_trap:
+     return NVPTXISD::Suld1DV2I32Trap;
+   case Intrinsic::nvvm_suld_1d_v2i64_trap:
+     return NVPTXISD::Suld1DV2I64Trap;
+   case Intrinsic::nvvm_suld_1d_v4i8_trap:
+     return NVPTXISD::Suld1DV4I8Trap;
+   case Intrinsic::nvvm_suld_1d_v4i16_trap:
+     return NVPTXISD::Suld1DV4I16Trap;
+   case Intrinsic::nvvm_suld_1d_v4i32_trap:
+     return NVPTXISD::Suld1DV4I32Trap;
+   case Intrinsic::nvvm_suld_1d_array_i8_trap:
+     return NVPTXISD::Suld1DArrayI8Trap;
+   case Intrinsic::nvvm_suld_1d_array_i16_trap:
+     return NVPTXISD::Suld1DArrayI16Trap;
+   case Intrinsic::nvvm_suld_1d_array_i32_trap:
+     return NVPTXISD::Suld1DArrayI32Trap;
+   case Intrinsic::nvvm_suld_1d_array_i64_trap:
+     return NVPTXISD::Suld1DArrayI64Trap;
+   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+     return NVPTXISD::Suld1DArrayV2I8Trap;
+   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+     return NVPTXISD::Suld1DArrayV2I16Trap;
+   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+     return NVPTXISD::Suld1DArrayV2I32Trap;
+   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+     return NVPTXISD::Suld1DArrayV2I64Trap;
+   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+     return NVPTXISD::Suld1DArrayV4I8Trap;
+   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+     return NVPTXISD::Suld1DArrayV4I16Trap;
+   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+     return NVPTXISD::Suld1DArrayV4I32Trap;
+   case Intrinsic::nvvm_suld_2d_i8_trap:
+     return NVPTXISD::Suld2DI8Trap;
+   case Intrinsic::nvvm_suld_2d_i16_trap:
+     return NVPTXISD::Suld2DI16Trap;
+   case Intrinsic::nvvm_suld_2d_i32_trap:
+     return NVPTXISD::Suld2DI32Trap;
+   case Intrinsic::nvvm_suld_2d_i64_trap:
+     return NVPTXISD::Suld2DI64Trap;
+   case Intrinsic::nvvm_suld_2d_v2i8_trap:
+     return NVPTXISD::Suld2DV2I8Trap;
+   case Intrinsic::nvvm_suld_2d_v2i16_trap:
+     return NVPTXISD::Suld2DV2I16Trap;
+   case Intrinsic::nvvm_suld_2d_v2i32_trap:
+     return NVPTXISD::Suld2DV2I32Trap;
+   case Intrinsic::nvvm_suld_2d_v2i64_trap:
+     return NVPTXISD::Suld2DV2I64Trap;
+   case Intrinsic::nvvm_suld_2d_v4i8_trap:
+     return NVPTXISD::Suld2DV4I8Trap;
+   case Intrinsic::nvvm_suld_2d_v4i16_trap:
+     return NVPTXISD::Suld2DV4I16Trap;
+   case Intrinsic::nvvm_suld_2d_v4i32_trap:
+     return NVPTXISD::Suld2DV4I32Trap;
+   case Intrinsic::nvvm_suld_2d_array_i8_trap:
+     return NVPTXISD::Suld2DArrayI8Trap;
+   case Intrinsic::nvvm_suld_2d_array_i16_trap:
+     return NVPTXISD::Suld2DArrayI16Trap;
+   case Intrinsic::nvvm_suld_2d_array_i32_trap:
+     return NVPTXISD::Suld2DArrayI32Trap;
+   case Intrinsic::nvvm_suld_2d_array_i64_trap:
+     return NVPTXISD::Suld2DArrayI64Trap;
+   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+     return NVPTXISD::Suld2DArrayV2I8Trap;
+   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+     return NVPTXISD::Suld2DArrayV2I16Trap;
+   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+     return NVPTXISD::Suld2DArrayV2I32Trap;
+   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+     return NVPTXISD::Suld2DArrayV2I64Trap;
+   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+     return NVPTXISD::Suld2DArrayV4I8Trap;
+   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+     return NVPTXISD::Suld2DArrayV4I16Trap;
+   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+     return NVPTXISD::Suld2DArrayV4I32Trap;
+   case Intrinsic::nvvm_suld_3d_i8_trap:
+     return NVPTXISD::Suld3DI8Trap;
+   case Intrinsic::nvvm_suld_3d_i16_trap:
+     return NVPTXISD::Suld3DI16Trap;
+   case Intrinsic::nvvm_suld_3d_i32_trap:
+     return NVPTXISD::Suld3DI32Trap;
+   case Intrinsic::nvvm_suld_3d_i64_trap:
+     return NVPTXISD::Suld3DI64Trap;
+   case Intrinsic::nvvm_suld_3d_v2i8_trap:
+     return NVPTXISD::Suld3DV2I8Trap;
+   case Intrinsic::nvvm_suld_3d_v2i16_trap:
+     return NVPTXISD::Suld3DV2I16Trap;
+   case Intrinsic::nvvm_suld_3d_v2i32_trap:
+     return NVPTXISD::Suld3DV2I32Trap;
+   case Intrinsic::nvvm_suld_3d_v2i64_trap:
+     return NVPTXISD::Suld3DV2I64Trap;
+   case Intrinsic::nvvm_suld_3d_v4i8_trap:
+     return NVPTXISD::Suld3DV4I8Trap;
+   case Intrinsic::nvvm_suld_3d_v4i16_trap:
+     return NVPTXISD::Suld3DV4I16Trap;
+   case Intrinsic::nvvm_suld_3d_v4i32_trap:
+     return NVPTXISD::Suld3DV4I32Trap;
+   case Intrinsic::nvvm_suld_1d_i8_zero:
+     return NVPTXISD::Suld1DI8Zero;
+   case Intrinsic::nvvm_suld_1d_i16_zero:
+     return NVPTXISD::Suld1DI16Zero;
+   case Intrinsic::nvvm_suld_1d_i32_zero:
+     return NVPTXISD::Suld1DI32Zero;
+   case Intrinsic::nvvm_suld_1d_i64_zero:
+     return NVPTXISD::Suld1DI64Zero;
+   case Intrinsic::nvvm_suld_1d_v2i8_zero:
+     return NVPTXISD::Suld1DV2I8Zero;
+   case Intrinsic::nvvm_suld_1d_v2i16_zero:
+     return NVPTXISD::Suld1DV2I16Zero;
+   case Intrinsic::nvvm_suld_1d_v2i32_zero:
+     return NVPTXISD::Suld1DV2I32Zero;
+   case Intrinsic::nvvm_suld_1d_v2i64_zero:
+     return NVPTXISD::Suld1DV2I64Zero;
+   case Intrinsic::nvvm_suld_1d_v4i8_zero:
+     return NVPTXISD::Suld1DV4I8Zero;
+   case Intrinsic::nvvm_suld_1d_v4i16_zero:
+     return NVPTXISD::Suld1DV4I16Zero;
+   case Intrinsic::nvvm_suld_1d_v4i32_zero:
+     return NVPTXISD::Suld1DV4I32Zero;
+   case Intrinsic::nvvm_suld_1d_array_i8_zero:
+     return NVPTXISD::Suld1DArrayI8Zero;
+   case Intrinsic::nvvm_suld_1d_array_i16_zero:
+     return NVPTXISD::Suld1DArrayI16Zero;
+   case Intrinsic::nvvm_suld_1d_array_i32_zero:
+     return NVPTXISD::Suld1DArrayI32Zero;
+   case Intrinsic::nvvm_suld_1d_array_i64_zero:
+     return NVPTXISD::Suld1DArrayI64Zero;
+   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+     return NVPTXISD::Suld1DArrayV2I8Zero;
+   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+     return NVPTXISD::Suld1DArrayV2I16Zero;
+   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+     return NVPTXISD::Suld1DArrayV2I32Zero;
+   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+     return NVPTXISD::Suld1DArrayV2I64Zero;
+   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+     return NVPTXISD::Suld1DArrayV4I8Zero;
+   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+     return NVPTXISD::Suld1DArrayV4I16Zero;
+   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+     return NVPTXISD::Suld1DArrayV4I32Zero;
+   case Intrinsic::nvvm_suld_2d_i8_zero:
+     return NVPTXISD::Suld2DI8Zero;
+   case Intrinsic::nvvm_suld_2d_i16_zero:
+     return NVPTXISD::Suld2DI16Zero;
+   case Intrinsic::nvvm_suld_2d_i32_zero:
+     return NVPTXISD::Suld2DI32Zero;
+   case Intrinsic::nvvm_suld_2d_i64_zero:
+     return NVPTXISD::Suld2DI64Zero;
+   case Intrinsic::nvvm_suld_2d_v2i8_zero:
+     return NVPTXISD::Suld2DV2I8Zero;
+   case Intrinsic::nvvm_suld_2d_v2i16_zero:
+     return NVPTXISD::Suld2DV2I16Zero;
+   case Intrinsic::nvvm_suld_2d_v2i32_zero:
+     return NVPTXISD::Suld2DV2I32Zero;
+   case Intrinsic::nvvm_suld_2d_v2i64_zero:
+     return NVPTXISD::Suld2DV2I64Zero;
+   case Intrinsic::nvvm_suld_2d_v4i8_zero:
+     return NVPTXISD::Suld2DV4I8Zero;
+   case Intrinsic::nvvm_suld_2d_v4i16_zero:
+     return NVPTXISD::Suld2DV4I16Zero;
+   case Intrinsic::nvvm_suld_2d_v4i32_zero:
+     return NVPTXISD::Suld2DV4I32Zero;
+   case Intrinsic::nvvm_suld_2d_array_i8_zero:
+     return NVPTXISD::Suld2DArrayI8Zero;
+   case Intrinsic::nvvm_suld_2d_array_i16_zero:
+     return NVPTXISD::Suld2DArrayI16Zero;
+   case Intrinsic::nvvm_suld_2d_array_i32_zero:
+     return NVPTXISD::Suld2DArrayI32Zero;
+   case Intrinsic::nvvm_suld_2d_array_i64_zero:
+     return NVPTXISD::Suld2DArrayI64Zero;
+   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+     return NVPTXISD::Suld2DArrayV2I8Zero;
+   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+     return NVPTXISD::Suld2DArrayV2I16Zero;
+   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+     return NVPTXISD::Suld2DArrayV2I32Zero;
+   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+     return NVPTXISD::Suld2DArrayV2I64Zero;
+   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+     return NVPTXISD::Suld2DArrayV4I8Zero;
+   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+     return NVPTXISD::Suld2DArrayV4I16Zero;
+   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+     return NVPTXISD::Suld2DArrayV4I32Zero;
+   case Intrinsic::nvvm_suld_3d_i8_zero:
+     return NVPTXISD::Suld3DI8Zero;
+   case Intrinsic::nvvm_suld_3d_i16_zero:
+     return NVPTXISD::Suld3DI16Zero;
+   case Intrinsic::nvvm_suld_3d_i32_zero:
+     return NVPTXISD::Suld3DI32Zero;
+   case Intrinsic::nvvm_suld_3d_i64_zero:
+     return NVPTXISD::Suld3DI64Zero;
+   case Intrinsic::nvvm_suld_3d_v2i8_zero:
+     return NVPTXISD::Suld3DV2I8Zero;
+   case Intrinsic::nvvm_suld_3d_v2i16_zero:
+     return NVPTXISD::Suld3DV2I16Zero;
+   case Intrinsic::nvvm_suld_3d_v2i32_zero:
+     return NVPTXISD::Suld3DV2I32Zero;
+   case Intrinsic::nvvm_suld_3d_v2i64_zero:
+     return NVPTXISD::Suld3DV2I64Zero;
+   case Intrinsic::nvvm_suld_3d_v4i8_zero:
+     return NVPTXISD::Suld3DV4I8Zero;
+   case Intrinsic::nvvm_suld_3d_v4i16_zero:
+     return NVPTXISD::Suld3DV4I16Zero;
+   case Intrinsic::nvvm_suld_3d_v4i32_zero:
+     return NVPTXISD::Suld3DV4I32Zero;
+   }
+ }
+ 
+ // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+ // TgtMemIntrinsic because we need information that is only available in the
+ // "Value" type of the destination pointer: in particular, its address space.
+ bool NVPTXTargetLowering::getTgtMemIntrinsic(
+     IntrinsicInfo &Info, const CallInst &I,
+     MachineFunction &MF, unsigned Intrinsic) const {
+   switch (Intrinsic) {
+   default:
+     return false;
+   case Intrinsic::nvvm_match_all_sync_i32p:
+   case Intrinsic::nvvm_match_all_sync_i64p:
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
+     // in order to model data exchange with other threads, but perform no real
+     // memory accesses.
+     Info.memVT = MVT::i1;
+ 
+     // Our result depends on both our and other thread's arguments.
+     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+     return true;
+   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     Info.memVT = MVT::v8f16;
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     Info.memVT = MVT::v4f16;
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     Info.memVT = MVT::v8f32;
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
+     Info.opc = ISD::INTRINSIC_VOID;
+     Info.memVT = MVT::v4f16;
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOStore;
+     Info.align = 16;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
+   case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
+     Info.opc = ISD::INTRINSIC_VOID;
+     Info.memVT = MVT::v8f32;
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOStore;
+     Info.align = 16;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_atomic_load_add_f32:
+   case Intrinsic::nvvm_atomic_load_add_f64:
+   case Intrinsic::nvvm_atomic_load_inc_32:
+   case Intrinsic::nvvm_atomic_load_dec_32:
+ 
+   case Intrinsic::nvvm_atomic_add_gen_f_cta:
+   case Intrinsic::nvvm_atomic_add_gen_f_sys:
+   case Intrinsic::nvvm_atomic_add_gen_i_cta:
+   case Intrinsic::nvvm_atomic_add_gen_i_sys:
+   case Intrinsic::nvvm_atomic_and_gen_i_cta:
+   case Intrinsic::nvvm_atomic_and_gen_i_sys:
+   case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+   case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+   case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+   case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+   case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+   case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+   case Intrinsic::nvvm_atomic_max_gen_i_cta:
+   case Intrinsic::nvvm_atomic_max_gen_i_sys:
+   case Intrinsic::nvvm_atomic_min_gen_i_cta:
+   case Intrinsic::nvvm_atomic_min_gen_i_sys:
+   case Intrinsic::nvvm_atomic_or_gen_i_cta:
+   case Intrinsic::nvvm_atomic_or_gen_i_sys:
+   case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+   case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+   case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+   case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
+     auto &DL = I.getModule()->getDataLayout();
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     Info.memVT = getValueType(DL, I.getType());
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+     Info.align = 0;
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_ldu_global_i:
+   case Intrinsic::nvvm_ldu_global_f:
+   case Intrinsic::nvvm_ldu_global_p: {
+     auto &DL = I.getModule()->getDataLayout();
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+       Info.memVT = getValueType(DL, I.getType());
+     else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+       Info.memVT = getPointerTy(DL);
+     else
+       Info.memVT = getValueType(DL, I.getType());
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ 
+     return true;
+   }
+   case Intrinsic::nvvm_ldg_global_i:
+   case Intrinsic::nvvm_ldg_global_f:
+   case Intrinsic::nvvm_ldg_global_p: {
+     auto &DL = I.getModule()->getDataLayout();
+ 
+     Info.opc = ISD::INTRINSIC_W_CHAIN;
+     if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+       Info.memVT = getValueType(DL, I.getType());
+     else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
+       Info.memVT = getPointerTy(DL);
+     else
+       Info.memVT = getValueType(DL, I.getType());
+     Info.ptrVal = I.getArgOperand(0);
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ 
+     return true;
+   }
+ 
+   case Intrinsic::nvvm_tex_1d_v4f32_s32:
+   case Intrinsic::nvvm_tex_1d_v4f32_f32:
+   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_v4f32_s32:
+   case Intrinsic::nvvm_tex_2d_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_3d_v4f32_s32:
+   case Intrinsic::nvvm_tex_3d_v4f32_f32:
+   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_cube_v4f32_f32:
+   case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+   case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+     Info.opc = getOpcForTextureInstr(Intrinsic);
+     Info.memVT = MVT::v4f32;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+ 
+   case Intrinsic::nvvm_tex_1d_v4s32_s32:
+   case Intrinsic::nvvm_tex_1d_v4s32_f32:
+   case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+   case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_v4s32_s32:
+   case Intrinsic::nvvm_tex_2d_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+   case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_3d_v4s32_s32:
+   case Intrinsic::nvvm_tex_3d_v4s32_f32:
+   case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_cube_v4s32_f32:
+   case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_cube_v4u32_f32:
+   case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_v4u32_s32:
+   case Intrinsic::nvvm_tex_1d_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+   case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_v4u32_s32:
+   case Intrinsic::nvvm_tex_2d_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+   case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_3d_v4u32_s32:
+   case Intrinsic::nvvm_tex_3d_v4u32_f32:
+   case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+   case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+   case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+   case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+   case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+   case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+   case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+   case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+     Info.opc = getOpcForTextureInstr(Intrinsic);
+     Info.memVT = MVT::v4i32;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+ 
+   case Intrinsic::nvvm_suld_1d_i8_clamp:
+   case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+   case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+   case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+   case Intrinsic::nvvm_suld_2d_i8_clamp:
+   case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+   case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+   case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+   case Intrinsic::nvvm_suld_3d_i8_clamp:
+   case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+   case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+   case Intrinsic::nvvm_suld_1d_i8_trap:
+   case Intrinsic::nvvm_suld_1d_v2i8_trap:
+   case Intrinsic::nvvm_suld_1d_v4i8_trap:
+   case Intrinsic::nvvm_suld_1d_array_i8_trap:
+   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+   case Intrinsic::nvvm_suld_2d_i8_trap:
+   case Intrinsic::nvvm_suld_2d_v2i8_trap:
+   case Intrinsic::nvvm_suld_2d_v4i8_trap:
+   case Intrinsic::nvvm_suld_2d_array_i8_trap:
+   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+   case Intrinsic::nvvm_suld_3d_i8_trap:
+   case Intrinsic::nvvm_suld_3d_v2i8_trap:
+   case Intrinsic::nvvm_suld_3d_v4i8_trap:
+   case Intrinsic::nvvm_suld_1d_i8_zero:
+   case Intrinsic::nvvm_suld_1d_v2i8_zero:
+   case Intrinsic::nvvm_suld_1d_v4i8_zero:
+   case Intrinsic::nvvm_suld_1d_array_i8_zero:
+   case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+   case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+   case Intrinsic::nvvm_suld_2d_i8_zero:
+   case Intrinsic::nvvm_suld_2d_v2i8_zero:
+   case Intrinsic::nvvm_suld_2d_v4i8_zero:
+   case Intrinsic::nvvm_suld_2d_array_i8_zero:
+   case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+   case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+   case Intrinsic::nvvm_suld_3d_i8_zero:
+   case Intrinsic::nvvm_suld_3d_v2i8_zero:
+   case Intrinsic::nvvm_suld_3d_v4i8_zero:
+     Info.opc = getOpcForSurfaceInstr(Intrinsic);
+     Info.memVT = MVT::i8;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+ 
+   case Intrinsic::nvvm_suld_1d_i16_clamp:
+   case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+   case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+   case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+   case Intrinsic::nvvm_suld_2d_i16_clamp:
+   case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+   case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+   case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+   case Intrinsic::nvvm_suld_3d_i16_clamp:
+   case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+   case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+   case Intrinsic::nvvm_suld_1d_i16_trap:
+   case Intrinsic::nvvm_suld_1d_v2i16_trap:
+   case Intrinsic::nvvm_suld_1d_v4i16_trap:
+   case Intrinsic::nvvm_suld_1d_array_i16_trap:
+   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+   case Intrinsic::nvvm_suld_2d_i16_trap:
+   case Intrinsic::nvvm_suld_2d_v2i16_trap:
+   case Intrinsic::nvvm_suld_2d_v4i16_trap:
+   case Intrinsic::nvvm_suld_2d_array_i16_trap:
+   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+   case Intrinsic::nvvm_suld_3d_i16_trap:
+   case Intrinsic::nvvm_suld_3d_v2i16_trap:
+   case Intrinsic::nvvm_suld_3d_v4i16_trap:
+   case Intrinsic::nvvm_suld_1d_i16_zero:
+   case Intrinsic::nvvm_suld_1d_v2i16_zero:
+   case Intrinsic::nvvm_suld_1d_v4i16_zero:
+   case Intrinsic::nvvm_suld_1d_array_i16_zero:
+   case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+   case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+   case Intrinsic::nvvm_suld_2d_i16_zero:
+   case Intrinsic::nvvm_suld_2d_v2i16_zero:
+   case Intrinsic::nvvm_suld_2d_v4i16_zero:
+   case Intrinsic::nvvm_suld_2d_array_i16_zero:
+   case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+   case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+   case Intrinsic::nvvm_suld_3d_i16_zero:
+   case Intrinsic::nvvm_suld_3d_v2i16_zero:
+   case Intrinsic::nvvm_suld_3d_v4i16_zero:
+     Info.opc = getOpcForSurfaceInstr(Intrinsic);
+     Info.memVT = MVT::i16;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+ 
+   case Intrinsic::nvvm_suld_1d_i32_clamp:
+   case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+   case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+   case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+   case Intrinsic::nvvm_suld_2d_i32_clamp:
+   case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+   case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+   case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+   case Intrinsic::nvvm_suld_3d_i32_clamp:
+   case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+   case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+   case Intrinsic::nvvm_suld_1d_i32_trap:
+   case Intrinsic::nvvm_suld_1d_v2i32_trap:
+   case Intrinsic::nvvm_suld_1d_v4i32_trap:
+   case Intrinsic::nvvm_suld_1d_array_i32_trap:
+   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+   case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+   case Intrinsic::nvvm_suld_2d_i32_trap:
+   case Intrinsic::nvvm_suld_2d_v2i32_trap:
+   case Intrinsic::nvvm_suld_2d_v4i32_trap:
+   case Intrinsic::nvvm_suld_2d_array_i32_trap:
+   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+   case Intrinsic::nvvm_suld_3d_i32_trap:
+   case Intrinsic::nvvm_suld_3d_v2i32_trap:
+   case Intrinsic::nvvm_suld_3d_v4i32_trap:
+   case Intrinsic::nvvm_suld_1d_i32_zero:
+   case Intrinsic::nvvm_suld_1d_v2i32_zero:
+   case Intrinsic::nvvm_suld_1d_v4i32_zero:
+   case Intrinsic::nvvm_suld_1d_array_i32_zero:
+   case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+   case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+   case Intrinsic::nvvm_suld_2d_i32_zero:
+   case Intrinsic::nvvm_suld_2d_v2i32_zero:
+   case Intrinsic::nvvm_suld_2d_v4i32_zero:
+   case Intrinsic::nvvm_suld_2d_array_i32_zero:
+   case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+   case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+   case Intrinsic::nvvm_suld_3d_i32_zero:
+   case Intrinsic::nvvm_suld_3d_v2i32_zero:
+   case Intrinsic::nvvm_suld_3d_v4i32_zero:
+     Info.opc = getOpcForSurfaceInstr(Intrinsic);
+     Info.memVT = MVT::i32;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+ 
+   case Intrinsic::nvvm_suld_1d_i64_clamp:
+   case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+   case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+   case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+   case Intrinsic::nvvm_suld_2d_i64_clamp:
+   case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+   case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+   case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+   case Intrinsic::nvvm_suld_3d_i64_clamp:
+   case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+   case Intrinsic::nvvm_suld_1d_i64_trap:
+   case Intrinsic::nvvm_suld_1d_v2i64_trap:
+   case Intrinsic::nvvm_suld_1d_array_i64_trap:
+   case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+   case Intrinsic::nvvm_suld_2d_i64_trap:
+   case Intrinsic::nvvm_suld_2d_v2i64_trap:
+   case Intrinsic::nvvm_suld_2d_array_i64_trap:
+   case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+   case Intrinsic::nvvm_suld_3d_i64_trap:
+   case Intrinsic::nvvm_suld_3d_v2i64_trap:
+   case Intrinsic::nvvm_suld_1d_i64_zero:
+   case Intrinsic::nvvm_suld_1d_v2i64_zero:
+   case Intrinsic::nvvm_suld_1d_array_i64_zero:
+   case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+   case Intrinsic::nvvm_suld_2d_i64_zero:
+   case Intrinsic::nvvm_suld_2d_v2i64_zero:
+   case Intrinsic::nvvm_suld_2d_array_i64_zero:
+   case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+   case Intrinsic::nvvm_suld_3d_i64_zero:
+   case Intrinsic::nvvm_suld_3d_v2i64_zero:
+     Info.opc = getOpcForSurfaceInstr(Intrinsic);
+     Info.memVT = MVT::i64;
+     Info.ptrVal = nullptr;
+     Info.offset = 0;
+     Info.flags = MachineMemOperand::MOLoad;
+     Info.align = 16;
+     return true;
+   }
+   return false;
+ }
+ 
+ /// Decide whether the addressing mode described by \p AM is legal for this
+ /// target. The hook guides target-specific optimizations such as loop
+ /// strength reduction (LoopStrengthReduce.cpp) and the address-mode memory
+ /// optimization in CodeGenPrepare.cpp.
+ ///
+ /// Only \p AM is inspected; \p DL, \p Ty, \p AS and \p I are unused here.
+ ///
+ /// \returns true when \p AM matches one of the shapes listed in the body.
+ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                 const AddrMode &AM, Type *Ty,
+                                                 unsigned AS, Instruction *I) const {
+   // An AddrMode encodes: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+   //
+   // Accepted shapes:
+   //   [avar]
+   //   [areg]
+   //   [areg+immoff]
+   //   [immAddr]
+ 
+   // A global base must stand alone: no offset, no base register, no scale.
+   if (AM.BaseGV)
+     return AM.BaseOffs == 0 && !AM.HasBaseReg && AM.Scale == 0;
+ 
+   // Scale == 0 leaves "r", "r+i" or "i", all of which are allowed.
+   if (AM.Scale == 0)
+     return true;
+ 
+   // Scale == 1 adds one more register, so it is only acceptable when there
+   // is no base register; "r+r" and "r+r+i" are rejected.
+   if (AM.Scale == 1)
+     return !AM.HasBaseReg;
+ 
+   // Every other scale value is rejected.
+   return false;
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                         NVPTX Inline Assembly Support
+ //===----------------------------------------------------------------------===//
+ 
+ /// Classify the inline-asm constraint string \p Constraint for this target.
+ ///
+ /// The single letters 'b', 'r', 'h', 'c', 'l', 'f', 'd', '0' and 'N' are
+ /// reported as C_RegisterClass; every other string is classified by the
+ /// generic TargetLowering implementation.
+ ///
+ /// \param Constraint the constraint text to classify.
+ /// \returns C_RegisterClass for the letters above, else the generic result.
+ NVPTXTargetLowering::ConstraintType
+ NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
+   // Only one-character constraints are handled locally.
+   if (Constraint.size() != 1)
+     return TargetLowering::getConstraintType(Constraint);
+ 
+   // Letters that select a register class on this target.
+   const StringRef RegClassLetters("brhclfd0N");
+   if (RegClassLetters.find(Constraint[0]) != StringRef::npos)
+     return C_RegisterClass;
+ 
+   // Anything else gets the generic classification.
+   return TargetLowering::getConstraintType(Constraint);
+ }
+ 
+ /// Map a single-letter inline-asm constraint to an NVPTX register class.
+ /// The register number in the returned pair is always 0 for the letters
+ /// handled here; unrecognised constraints fall back to TargetLowering.
+ std::pair<unsigned, const TargetRegisterClass *>
+ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                   StringRef Constraint,
+                                                   MVT VT) const {
+   const TargetRegisterClass *RC = nullptr;
+   if (Constraint.size() == 1) {
+     switch (Constraint[0]) {
+     case 'b': RC = &NVPTX::Int1RegsRegClass; break;
+     case 'c':
+     case 'h': RC = &NVPTX::Int16RegsRegClass; break;
+     case 'r': RC = &NVPTX::Int32RegsRegClass; break;
+     case 'l':
+     case 'N': RC = &NVPTX::Int64RegsRegClass; break;
+     case 'f': RC = &NVPTX::Float32RegsRegClass; break;
+     case 'd': RC = &NVPTX::Float64RegsRegClass; break;
+     default: break;
+     }
+   }
+   if (RC)
+     return std::make_pair(0U, RC);
+   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ //                         NVPTX DAG Combining
+ //===----------------------------------------------------------------------===//
+ 
+ /// Report whether FP multiply/add pairs may be contracted into FMA for \p MF.
+ bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                    CodeGenOpt::Level OptLevel) const {
+   // An explicit FMAContractLevelOpt on the command line overrides everything.
+   const bool LevelGivenOnCmdLine = FMAContractLevelOpt.getNumOccurrences() > 0;
+   if (LevelGivenOnCmdLine)
+     return FMAContractLevelOpt > 0;
+ 
+   // Unoptimized builds never contract.
+   if (OptLevel == CodeGenOpt::None)
+     return false;
+ 
+   // Otherwise allow it when fast FP-op fusion or unsafe FP math is enabled.
+   const auto &Opts = MF.getTarget().Options;
+   return Opts.AllowFPOpFusion == FPOpFusion::Fast || allowUnsafeFPMath(MF);
+ }
+ 
+ /// Report whether unsafe FP math is permitted for \p MF, either through the
+ /// target options or through the function's "unsafe-fp-math" attribute.
+ bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+   // A target-wide UnsafeFPMath option is sufficient on its own.
+   if (MF.getTarget().Options.UnsafeFPMath)
+     return true;
+ 
+   // Otherwise the function must carry "unsafe-fp-math" with value "true".
+   const StringRef AttrName("unsafe-fp-math");
+   const Function &Fn = MF.getFunction();
+   if (!Fn.hasFnAttribute(AttrName))
+     return false;
+ 
+   const StringRef Setting = Fn.getFnAttribute(AttrName).getValueAsString();
+   return Setting == "true";
+ }
+ 
+ /// PerformADDCombineWithOperands - Helper for PerformADDCombine, which calls it
+ /// with (N0, N1) and, if that fails, with the operands commuted. Folds a MUL or
+ /// FMUL found in N0 into NVPTXISD::IMAD / ISD::FMA with N1 as the addend.
+ /// Returns an empty SDValue when no fold applies; Subtarget is unused here.
+ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                            TargetLowering::DAGCombinerInfo &DCI,
+                                              const NVPTXSubtarget &Subtarget,
+                                              CodeGenOpt::Level OptLevel) {
+   SelectionDAG  &DAG = DCI.DAG;
+   // Only scalar types are handled; bail out on vectors.
+   EVT VT=N0.getValueType();
+   if (VT.isVector())
+     return SDValue();
+ 
+   // fold (add (mul a, b), c) -> (mad a, b, c)
+   //
+   if (N0.getOpcode() == ISD::MUL) {
+     assert (VT.isInteger());
+     // For integer:
+     // Integer multiply-add costs the same as integer multiply but more than
+     // integer add, so fuse only when the mul has a single use. The fold is
+     // also limited to i32 and is skipped when not optimizing.
+     if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
+         !N0.getNode()->hasOneUse())
+       return SDValue();
+ 
+     // Do the folding
+     return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                        N0.getOperand(0), N0.getOperand(1), N1);
+   }
+   else if (N0.getOpcode() == ISD::FMUL) {
+     if (VT == MVT::f32 || VT == MVT::f64) {
+       const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+           &DAG.getTargetLoweringInfo());
+       if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+         return SDValue();
+ 
+       // For floating point:
+       // Fuse only when the mul has fewer than 5 uses. With more than 4 uses,
+       // even if they are all adds, fusing them will increase register
+       // pressure.
+       // A use that is not an FADD cannot be fused into an fma, so the mul is
+       // still needed; such cases are only fused if the extra checks below
+       // pass.
+       //
+       int numUses = 0;
+       int nonAddCount = 0;
+       for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+            UE = N0.getNode()->use_end();
+            UI != UE; ++UI) {
+         numUses++;
+         SDNode *User = *UI;
+         if (User->getOpcode() != ISD::FADD)
+           ++nonAddCount;
+       }
+       if (numUses >= 5)
+         return SDValue();
+       if (nonAddCount) {
+         int orderNo = N->getIROrder();
+         int orderNo2 = N0.getNode()->getIROrder();
+         // Simple heuristic for potential register pressure: the difference
+         // in IR order is used to measure the distance between def and use;
+         // the longer the distance, the more likely it is to cause register
+         // pressure. Bail out when that distance is below 500.
+         if (orderNo - orderNo2 < 500)
+           return SDValue();
+ 
+         // Now, check if at least one of the FMUL's operands is live beyond the node N,
+         // which guarantees that the FMA will not increase register pressure at node N.
+         bool opIsLive = false;
+         const SDNode *left = N0.getOperand(0).getNode();
+         const SDNode *right = N0.getOperand(1).getNode();
+         // A constant operand counts as live.
+         if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
+           opIsLive = true;
+         // Otherwise look for a user of the left operand ordered after N.
+         if (!opIsLive)
+           for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
+             SDNode *User = *UI;
+             int orderNo3 = User->getIROrder();
+             if (orderNo3 > orderNo) {
+               opIsLive = true;
+               break;
+             }
+           }
+         // Failing that, look for such a user of the right operand.
+         if (!opIsLive)
+           for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
+             SDNode *User = *UI;
+             int orderNo3 = User->getIROrder();
+             if (orderNo3 > orderNo) {
+               opIsLive = true;
+               break;
+             }
+           }
+ 
+         if (!opIsLive)
+           return SDValue();
+       }
+ 
+       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                          N0.getOperand(0), N0.getOperand(1), N1);
+     }
+   }
+ 
+   return SDValue();
+ }
+ 
+ /// PerformADDCombine - Target-specific DAG combine for the add node \p N:
+ /// try the fold with the operands as written, then with them swapped.
+ static SDValue PerformADDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const NVPTXSubtarget &Subtarget,
+                                  CodeGenOpt::Level OptLevel) {
+   const SDValue LHS = N->getOperand(0);
+   const SDValue RHS = N->getOperand(1);
+   // Attempt 1: operands in their original order.
+   SDValue Folded =
+       PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget, OptLevel);
+   // Attempt 2: commuted operands, only when attempt 1 produced nothing.
+   if (!Folded)
+     Folded =
+         PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget, OptLevel);
+   return Folded;
+ }
+ 
+ static SDValue PerformANDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   // The type legalizer turns a vector load of i8 values into a zextload to i16
+   // registers, optionally ANY_EXTENDs it (if target type is integer),
+   // and ANDs off the high 8 bits. Since we turn this load into a
+   // target-specific DAG node, the DAG combiner fails to eliminate these AND
+   // nodes. Do that here.
+   SDValue Val = N->getOperand(0);
+   SDValue Mask = N->getOperand(1);
+ 
+   if (isa<ConstantSDNode>(Val)) {
+     std::swap(Val, Mask);
+   }
+ 
+   SDValue AExt;
+   // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+   if (Val.getOpcode() == ISD::ANY_EXTEND) {
+     AExt = Val;
+     Val = Val->getOperand(0);
+   }
+ 
+   if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+     Val = Val->getOperand(0);
+   }
+ 
+   if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+       Val->getOpcode() == NVPTXISD::LoadV4) {
+     ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+     if (!MaskCnst) {
+       // Not an AND with a constant
+       return SDValue();
+     }
+ 
+     uint64_t MaskVal = MaskCnst->getZExtValue();
+     if (MaskVal != 0xff) {
+       // Not an AND that chops off top 8 bits
+       return SDValue();
+     }
+ 
+     MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+     if (!Mem) {
+       // Not a MemSDNode?!?
+       return SDValue();
+     }
+ 
+     EVT MemVT = Mem->getMemoryVT();
+     if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+       // We only handle the i8 case
+       return SDValue();
+     }
+ 
+     unsigned ExtType =
+       cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+         getZExtValue();
+     if (ExtType == ISD::SEXTLOAD) {
+       // If for some reason the load is a sextload, the and is needed to zero
+       // out the high 8 bits
+       return SDValue();
+     }
+ 
+     bool AddTo = false;
+     if (AExt.getNode() != nullptr) {
+       // Re-insert the ext as a zext.
+       Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+                             AExt.getValueType(), Val);
+       AddTo = true;
+     }
+ 
+     // If we get here, the AND is unnecessary.  Just replace it with the load
+     DCI.CombineTo(N, Val, AddTo);
+   }
+ 
+   return SDValue();
+ }
+ 
+ static SDValue PerformREMCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
+ 
+   // Don't do anything at less than -O2.
+   if (OptLevel < CodeGenOpt::Default)
+     return SDValue();
+ 
+   SelectionDAG &DAG = DCI.DAG;
+   SDLoc DL(N);
+   EVT VT = N->getValueType(0);
+   bool IsSigned = N->getOpcode() == ISD::SREM;
+   unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
+ 
+   const SDValue &Num = N->getOperand(0);
+   const SDValue &Den = N->getOperand(1);
+ 
+   for (const SDNode *U : Num->uses()) {
+     if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
+         U->getOperand(1) == Den) {
+       // Num % Den -> Num - (Num / Den) * Den
+       return DAG.getNode(ISD::SUB, DL, VT, Num,
+                          DAG.getNode(ISD::MUL, DL, VT,
+                                      DAG.getNode(DivOpc, DL, VT, Num, Den),
+                                      Den));
+     }
+   }
+   return SDValue();
+ }
+ 
+ enum OperandSignedness {
+   Signed = 0,
+   Unsigned,
+   Unknown
+ };
+ 
+ /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+ /// that can be demoted to \p OptSize bits without loss of information. The
+ /// signedness of the operand, if determinable, is placed in \p S.
+ static bool IsMulWideOperandDemotable(SDValue Op,
+                                       unsigned OptSize,
+                                       OperandSignedness &S) {
+   S = Unknown;
+ 
+   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+     EVT OrigVT = Op.getOperand(0).getValueType();
+     if (OrigVT.getSizeInBits() <= OptSize) {
+       S = Signed;
+       return true;
+     }
+   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+     EVT OrigVT = Op.getOperand(0).getValueType();
+     if (OrigVT.getSizeInBits() <= OptSize) {
+       S = Unsigned;
+       return true;
+     }
+   }
+ 
+   return false;
+ }
+ 
+ /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+ /// be demoted to \p OptSize bits without loss of information. If the operands
+ /// contain a constant, it should appear as the RHS operand. The signedness of
+ /// the operands is placed in \p IsSigned.
+ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                         unsigned OptSize,
+                                         bool &IsSigned) {
+   OperandSignedness LHSSign;
+ 
+   // The LHS operand must be a demotable op
+   if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+     return false;
+ 
+   // We should have been able to determine the signedness from the LHS
+   if (LHSSign == Unknown)
+     return false;
+ 
+   IsSigned = (LHSSign == Signed);
+ 
+   // The RHS can be a demotable op or a constant
+   if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+     const APInt &Val = CI->getAPIntValue();
+     if (LHSSign == Unsigned) {
+       return Val.isIntN(OptSize);
+     } else {
+       return Val.isSignedIntN(OptSize);
+     }
+   } else {
+     OperandSignedness RHSSign;
+     if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+       return false;
+ 
+     return LHSSign == RHSSign;
+   }
+ }
+ 
+ /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+ /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+ /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+ /// amount.
+ static SDValue TryMULWIDECombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+   EVT MulType = N->getValueType(0);
+   if (MulType != MVT::i32 && MulType != MVT::i64) {
+     return SDValue();
+   }
+ 
+   SDLoc DL(N);
+   unsigned OptSize = MulType.getSizeInBits() >> 1;
+   SDValue LHS = N->getOperand(0);
+   SDValue RHS = N->getOperand(1);
+ 
+   // Canonicalize the multiply so the constant (if any) is on the right
+   if (N->getOpcode() == ISD::MUL) {
+     if (isa<ConstantSDNode>(LHS)) {
+       std::swap(LHS, RHS);
+     }
+   }
+ 
+   // If we have a SHL, determine the actual multiply amount
+   if (N->getOpcode() == ISD::SHL) {
+     ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+     if (!ShlRHS) {
+       return SDValue();
+     }
+ 
+     APInt ShiftAmt = ShlRHS->getAPIntValue();
+     unsigned BitWidth = MulType.getSizeInBits();
+     if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+       APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+       RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
+     } else {
+       return SDValue();
+     }
+   }
+ 
+   bool Signed;
+   // Verify that our operands are demotable
+   if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+     return SDValue();
+   }
+ 
+   EVT DemotedVT;
+   if (MulType == MVT::i32) {
+     DemotedVT = MVT::i16;
+   } else {
+     DemotedVT = MVT::i32;
+   }
+ 
+   // Truncate the operands to the correct size. Note that these are just for
+   // type consistency and will (likely) be eliminated in later phases.
+   SDValue TruncLHS =
+     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
+   SDValue TruncRHS =
+     DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
+ 
+   unsigned Opc;
+   if (Signed) {
+     Opc = NVPTXISD::MUL_WIDE_SIGNED;
+   } else {
+     Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+   }
+ 
+   return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
+ }
+ 
+ /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+ static SDValue PerformMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   if (OptLevel > 0) {
+     // Try mul.wide combining at OptLevel > 0
+     if (SDValue Ret = TryMULWIDECombine(N, DCI))
+       return Ret;
+   }
+ 
+   return SDValue();
+ }
+ 
+ /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+ static SDValue PerformSHLCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  CodeGenOpt::Level OptLevel) {
+   if (OptLevel > 0) {
+     // Try mul.wide combining at OptLevel > 0
+     if (SDValue Ret = TryMULWIDECombine(N, DCI))
+       return Ret;
+   }
+ 
+   return SDValue();
+ }
+ 
+ static SDValue PerformSETCCCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+   EVT CCType = N->getValueType(0);
+   SDValue A = N->getOperand(0);
+   SDValue B = N->getOperand(1);
+ 
+   if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+     return SDValue();
+ 
+   SDLoc DL(N);
+   // setp.f16x2 returns two scalar predicates, which we need to
+   // convert back to v2i1. The returned result will be scalarized by
+   // the legalizer, but the comparison will remain a single vector
+   // instruction.
+   SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+                                    DCI.DAG.getVTList(MVT::i1, MVT::i1),
+                                    {A, B, N->getOperand(2)});
+   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+                          CCNode.getValue(1));
+ }
+ 
+ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+   switch (N->getOpcode()) {
+     default: break;
+     case ISD::ADD:
+     case ISD::FADD:
+       return PerformADDCombine(N, DCI, STI, OptLevel);
+     case ISD::MUL:
+       return PerformMULCombine(N, DCI, OptLevel);
+     case ISD::SHL:
+       return PerformSHLCombine(N, DCI, OptLevel);
+     case ISD::AND:
+       return PerformANDCombine(N, DCI);
+     case ISD::UREM:
+     case ISD::SREM:
+       return PerformREMCombine(N, DCI, OptLevel);
+     case ISD::SETCC:
+       return PerformSETCCCombine(N, DCI);
+   }
+   return SDValue();
+ }
+ 
+ /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
+ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &Results) {
+   EVT ResVT = N->getValueType(0);
+   SDLoc DL(N);
+ 
+   assert(ResVT.isVector() && "Vector load must have vector type");
+ 
+   // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+   // legal.  We can (and should) split that into 2 loads of <2 x double> here
+   // but I'm leaving that as a TODO for now.
+   assert(ResVT.isSimple() && "Can only handle simple types");
+   switch (ResVT.getSimpleVT().SimpleTy) {
+   default:
+     return;
+   case MVT::v2i8:
+   case MVT::v2i16:
+   case MVT::v2i32:
+   case MVT::v2i64:
+   case MVT::v2f16:
+   case MVT::v2f32:
+   case MVT::v2f64:
+   case MVT::v4i8:
+   case MVT::v4i16:
+   case MVT::v4i32:
+   case MVT::v4f16:
+   case MVT::v4f32:
+   case MVT::v8f16: // <4 x f16x2>
+     // This is a "native" vector type
+     break;
+   }
+ 
+   LoadSDNode *LD = cast<LoadSDNode>(N);
+ 
+   unsigned Align = LD->getAlignment();
+   auto &TD = DAG.getDataLayout();
+   unsigned PrefAlign =
+       TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+   if (Align < PrefAlign) {
+     // This load is not sufficiently aligned, so bail out and let this vector
+     // load be scalarized.  Note that we may still be able to emit smaller
+     // vector loads.  For example, if we are loading a <4 x float> with an
+     // alignment of 8, this check will fail but the legalizer will try again
+     // with 2 x <2 x float>, which will succeed with an alignment of 8.
+     return;
+   }
+ 
+   EVT EltVT = ResVT.getVectorElementType();
+   unsigned NumElts = ResVT.getVectorNumElements();
+ 
+   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+   // loaded type to i16 and propagate the "real" type as the memory type.
+   bool NeedTrunc = false;
+   if (EltVT.getSizeInBits() < 16) {
+     EltVT = MVT::i16;
+     NeedTrunc = true;
+   }
+ 
+   unsigned Opcode = 0;
+   SDVTList LdResVTs;
+   bool LoadF16x2 = false;
+ 
+   switch (NumElts) {
+   default:
+     return;
+   case 2:
+     Opcode = NVPTXISD::LoadV2;
+     LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+     break;
+   case 4: {
+     Opcode = NVPTXISD::LoadV4;
+     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+     LdResVTs = DAG.getVTList(ListVTs);
+     break;
+   }
+   case 8: {
+     // v8f16 is a special case. PTX doesn't have ld.v8.f16
+     // instruction. Instead, we split the vector into v2f16 chunks and
+     // load them with ld.v4.b32.
+     assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+     LoadF16x2 = true;
+     Opcode = NVPTXISD::LoadV4;
+     EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+                      MVT::Other};
+     LdResVTs = DAG.getVTList(ListVTs);
+     break;
+   }
+   }
+ 
+   // Copy regular operands
+   SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
+ 
+   // The select routine does not have access to the LoadSDNode instance, so
+   // pass along the extension information
+   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+ 
+   SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                           LD->getMemoryVT(),
+                                           LD->getMemOperand());
+ 
+   SmallVector<SDValue, 8> ScalarRes;
+   if (LoadF16x2) {
+     // Split v2f16 subvectors back into individual elements.
+     NumElts /= 2;
+     for (unsigned i = 0; i < NumElts; ++i) {
+       SDValue SubVector = NewLD.getValue(i);
+       SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                                DAG.getIntPtrConstant(0, DL));
+       SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                                DAG.getIntPtrConstant(1, DL));
+       ScalarRes.push_back(E0);
+       ScalarRes.push_back(E1);
+     }
+   } else {
+     for (unsigned i = 0; i < NumElts; ++i) {
+       SDValue Res = NewLD.getValue(i);
+       if (NeedTrunc)
+         Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+       ScalarRes.push_back(Res);
+     }
+   }
+ 
+   SDValue LoadChain = NewLD.getValue(NumElts);
+ 
+   SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+ 
+   Results.push_back(BuildVec);
+   Results.push_back(LoadChain);
+ }
+ 
+ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &Results) {
+   SDValue Chain = N->getOperand(0);
+   SDValue Intrin = N->getOperand(1);
+   SDLoc DL(N);
+ 
+   // Get the intrinsic ID
+   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+   switch (IntrinNo) {
+   default:
+     return;
+   case Intrinsic::nvvm_ldg_global_i:
+   case Intrinsic::nvvm_ldg_global_f:
+   case Intrinsic::nvvm_ldg_global_p:
+   case Intrinsic::nvvm_ldu_global_i:
+   case Intrinsic::nvvm_ldu_global_f:
+   case Intrinsic::nvvm_ldu_global_p: {
+     EVT ResVT = N->getValueType(0);
+ 
+     if (ResVT.isVector()) {
+       // Vector LDG/LDU
+ 
+       unsigned NumElts = ResVT.getVectorNumElements();
+       EVT EltVT = ResVT.getVectorElementType();
+ 
+       // Since LDU/LDG are target nodes, we cannot rely on DAG type
+       // legalization.
+       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+       // loaded type to i16 and propagate the "real" type as the memory type.
+       bool NeedTrunc = false;
+       if (EltVT.getSizeInBits() < 16) {
+         EltVT = MVT::i16;
+         NeedTrunc = true;
+       }
+ 
+       unsigned Opcode = 0;
+       SDVTList LdResVTs;
+ 
+       switch (NumElts) {
+       default:
+         return;
+       case 2:
+         switch (IntrinNo) {
+         default:
+           return;
+         case Intrinsic::nvvm_ldg_global_i:
+         case Intrinsic::nvvm_ldg_global_f:
+         case Intrinsic::nvvm_ldg_global_p:
+           Opcode = NVPTXISD::LDGV2;
+           break;
+         case Intrinsic::nvvm_ldu_global_i:
+         case Intrinsic::nvvm_ldu_global_f:
+         case Intrinsic::nvvm_ldu_global_p:
+           Opcode = NVPTXISD::LDUV2;
+           break;
+         }
+         LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+         break;
+       case 4: {
+         switch (IntrinNo) {
+         default:
+           return;
+         case Intrinsic::nvvm_ldg_global_i:
+         case Intrinsic::nvvm_ldg_global_f:
+         case Intrinsic::nvvm_ldg_global_p:
+           Opcode = NVPTXISD::LDGV4;
+           break;
+         case Intrinsic::nvvm_ldu_global_i:
+         case Intrinsic::nvvm_ldu_global_f:
+         case Intrinsic::nvvm_ldu_global_p:
+           Opcode = NVPTXISD::LDUV4;
+           break;
+         }
+         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+         LdResVTs = DAG.getVTList(ListVTs);
+         break;
+       }
+       }
+ 
+       SmallVector<SDValue, 8> OtherOps;
+ 
+       // Copy regular operands
+ 
+       OtherOps.push_back(Chain); // Chain
+                                  // Skip operand 1 (intrinsic ID)
+       // Others
+       OtherOps.append(N->op_begin() + 2, N->op_end());
+ 
+       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+ 
+       SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                               MemSD->getMemoryVT(),
+                                               MemSD->getMemOperand());
+ 
+       SmallVector<SDValue, 4> ScalarRes;
+ 
+       for (unsigned i = 0; i < NumElts; ++i) {
+         SDValue Res = NewLD.getValue(i);
+         if (NeedTrunc)
+           Res =
+               DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+         ScalarRes.push_back(Res);
+       }
+ 
+       SDValue LoadChain = NewLD.getValue(NumElts);
+ 
+       SDValue BuildVec =
+           DAG.getBuildVector(ResVT, DL, ScalarRes);
+ 
+       Results.push_back(BuildVec);
+       Results.push_back(LoadChain);
+     } else {
+       // i8 LDG/LDU
+       assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+              "Custom handling of non-i8 ldu/ldg?");
+ 
+       // Just copy all operands as-is
+       SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+ 
+       // Force output to i16
+       SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+ 
+       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+ 
+       // We make sure the memory type is i8, which will be used during isel
+       // to select the proper instruction.
+       SDValue NewLD =
+           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                   MVT::i8, MemSD->getMemOperand());
+ 
+       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                     NewLD.getValue(0)));
+       Results.push_back(NewLD.getValue(1));
+     }
+   }
+   }
+ }
+ 
+ void NVPTXTargetLowering::ReplaceNodeResults(
+     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+   switch (N->getOpcode()) {
+   default:
+     report_fatal_error("Unhandled custom legalization");
+   case ISD::LOAD:
+     ReplaceLoadVector(N, DAG, Results);
+     return;
+   case ISD::INTRINSIC_W_CHAIN:
+     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+     return;
+   }
+ }
+ 
+ // Pin NVPTXTargetObjectFile's vtables to this file.
+ NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
+ 
+ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
+     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+   return getDataSection();
+ }
+diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
+index 3e109f75b66..66fab2b6f48 100644
+--- a/lib/Target/NVPTX/NVPTXISelLowering.h
++++ b/lib/Target/NVPTX/NVPTXISelLowering.h
+@@ -1,580 +1,581 @@
+ //===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file defines the interfaces that NVPTX uses to lower LLVM code into a
+ // selection DAG.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+ #define LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+ 
+ #include "NVPTX.h"
+ #include "llvm/CodeGen/SelectionDAG.h"
+ #include "llvm/CodeGen/TargetLowering.h"
+ 
+ namespace llvm {
+ namespace NVPTXISD {
+ enum NodeType : unsigned {
+   // Start the numbering from where ISD NodeType finishes.
+   FIRST_NUMBER = ISD::BUILTIN_OP_END,
+   Wrapper,
+   CALL,
+   RET_FLAG,
+   LOAD_PARAM,
+   DeclareParam,
+   DeclareScalarParam,
+   DeclareRetParam,
+   DeclareRet,
+   DeclareScalarRet,
+   PrintCall,
+   PrintConvergentCall,
+   PrintCallUni,
+   PrintConvergentCallUni,
+   CallArgBegin,
+   CallArg,
+   LastCallArg,
+   CallArgEnd,
+   CallVoid,
+   CallVal,
+   CallSymbol,
+   Prototype,
+   MoveParam,
+   PseudoUseParam,
+   RETURN,
+   CallSeqBegin,
+   CallSeqEnd,
+   CallPrototype,
++  ProxyReg,
+   FUN_SHFL_CLAMP,
+   FUN_SHFR_CLAMP,
+   MUL_WIDE_SIGNED,
+   MUL_WIDE_UNSIGNED,
+   IMAD,
+   SETP_F16X2,
+   Dummy,
+ 
+   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+   LoadV4,
+   LDGV2, // LDG.v2
+   LDGV4, // LDG.v4
+   LDUV2, // LDU.v2
+   LDUV4, // LDU.v4
+   StoreV2,
+   StoreV4,
+   LoadParam,
+   LoadParamV2,
+   LoadParamV4,
+   StoreParam,
+   StoreParamV2,
+   StoreParamV4,
+   StoreParamS32, // to sext and store a <32bit value, not used currently
+   StoreParamU32, // to zext and store a <32bit value, not used currently
+   StoreRetval,
+   StoreRetvalV2,
+   StoreRetvalV4,
+ 
+   // Texture intrinsics
+   Tex1DFloatS32,
+   Tex1DFloatFloat,
+   Tex1DFloatFloatLevel,
+   Tex1DFloatFloatGrad,
+   Tex1DS32S32,
+   Tex1DS32Float,
+   Tex1DS32FloatLevel,
+   Tex1DS32FloatGrad,
+   Tex1DU32S32,
+   Tex1DU32Float,
+   Tex1DU32FloatLevel,
+   Tex1DU32FloatGrad,
+   Tex1DArrayFloatS32,
+   Tex1DArrayFloatFloat,
+   Tex1DArrayFloatFloatLevel,
+   Tex1DArrayFloatFloatGrad,
+   Tex1DArrayS32S32,
+   Tex1DArrayS32Float,
+   Tex1DArrayS32FloatLevel,
+   Tex1DArrayS32FloatGrad,
+   Tex1DArrayU32S32,
+   Tex1DArrayU32Float,
+   Tex1DArrayU32FloatLevel,
+   Tex1DArrayU32FloatGrad,
+   Tex2DFloatS32,
+   Tex2DFloatFloat,
+   Tex2DFloatFloatLevel,
+   Tex2DFloatFloatGrad,
+   Tex2DS32S32,
+   Tex2DS32Float,
+   Tex2DS32FloatLevel,
+   Tex2DS32FloatGrad,
+   Tex2DU32S32,
+   Tex2DU32Float,
+   Tex2DU32FloatLevel,
+   Tex2DU32FloatGrad,
+   Tex2DArrayFloatS32,
+   Tex2DArrayFloatFloat,
+   Tex2DArrayFloatFloatLevel,
+   Tex2DArrayFloatFloatGrad,
+   Tex2DArrayS32S32,
+   Tex2DArrayS32Float,
+   Tex2DArrayS32FloatLevel,
+   Tex2DArrayS32FloatGrad,
+   Tex2DArrayU32S32,
+   Tex2DArrayU32Float,
+   Tex2DArrayU32FloatLevel,
+   Tex2DArrayU32FloatGrad,
+   Tex3DFloatS32,
+   Tex3DFloatFloat,
+   Tex3DFloatFloatLevel,
+   Tex3DFloatFloatGrad,
+   Tex3DS32S32,
+   Tex3DS32Float,
+   Tex3DS32FloatLevel,
+   Tex3DS32FloatGrad,
+   Tex3DU32S32,
+   Tex3DU32Float,
+   Tex3DU32FloatLevel,
+   Tex3DU32FloatGrad,
+   TexCubeFloatFloat,
+   TexCubeFloatFloatLevel,
+   TexCubeS32Float,
+   TexCubeS32FloatLevel,
+   TexCubeU32Float,
+   TexCubeU32FloatLevel,
+   TexCubeArrayFloatFloat,
+   TexCubeArrayFloatFloatLevel,
+   TexCubeArrayS32Float,
+   TexCubeArrayS32FloatLevel,
+   TexCubeArrayU32Float,
+   TexCubeArrayU32FloatLevel,
+   Tld4R2DFloatFloat,
+   Tld4G2DFloatFloat,
+   Tld4B2DFloatFloat,
+   Tld4A2DFloatFloat,
+   Tld4R2DS64Float,
+   Tld4G2DS64Float,
+   Tld4B2DS64Float,
+   Tld4A2DS64Float,
+   Tld4R2DU64Float,
+   Tld4G2DU64Float,
+   Tld4B2DU64Float,
+   Tld4A2DU64Float,
+   TexUnified1DFloatS32,
+   TexUnified1DFloatFloat,
+   TexUnified1DFloatFloatLevel,
+   TexUnified1DFloatFloatGrad,
+   TexUnified1DS32S32,
+   TexUnified1DS32Float,
+   TexUnified1DS32FloatLevel,
+   TexUnified1DS32FloatGrad,
+   TexUnified1DU32S32,
+   TexUnified1DU32Float,
+   TexUnified1DU32FloatLevel,
+   TexUnified1DU32FloatGrad,
+   TexUnified1DArrayFloatS32,
+   TexUnified1DArrayFloatFloat,
+   TexUnified1DArrayFloatFloatLevel,
+   TexUnified1DArrayFloatFloatGrad,
+   TexUnified1DArrayS32S32,
+   TexUnified1DArrayS32Float,
+   TexUnified1DArrayS32FloatLevel,
+   TexUnified1DArrayS32FloatGrad,
+   TexUnified1DArrayU32S32,
+   TexUnified1DArrayU32Float,
+   TexUnified1DArrayU32FloatLevel,
+   TexUnified1DArrayU32FloatGrad,
+   TexUnified2DFloatS32,
+   TexUnified2DFloatFloat,
+   TexUnified2DFloatFloatLevel,
+   TexUnified2DFloatFloatGrad,
+   TexUnified2DS32S32,
+   TexUnified2DS32Float,
+   TexUnified2DS32FloatLevel,
+   TexUnified2DS32FloatGrad,
+   TexUnified2DU32S32,
+   TexUnified2DU32Float,
+   TexUnified2DU32FloatLevel,
+   TexUnified2DU32FloatGrad,
+   TexUnified2DArrayFloatS32,
+   TexUnified2DArrayFloatFloat,
+   TexUnified2DArrayFloatFloatLevel,
+   TexUnified2DArrayFloatFloatGrad,
+   TexUnified2DArrayS32S32,
+   TexUnified2DArrayS32Float,
+   TexUnified2DArrayS32FloatLevel,
+   TexUnified2DArrayS32FloatGrad,
+   TexUnified2DArrayU32S32,
+   TexUnified2DArrayU32Float,
+   TexUnified2DArrayU32FloatLevel,
+   TexUnified2DArrayU32FloatGrad,
+   TexUnified3DFloatS32,
+   TexUnified3DFloatFloat,
+   TexUnified3DFloatFloatLevel,
+   TexUnified3DFloatFloatGrad,
+   TexUnified3DS32S32,
+   TexUnified3DS32Float,
+   TexUnified3DS32FloatLevel,
+   TexUnified3DS32FloatGrad,
+   TexUnified3DU32S32,
+   TexUnified3DU32Float,
+   TexUnified3DU32FloatLevel,
+   TexUnified3DU32FloatGrad,
+   TexUnifiedCubeFloatFloat,
+   TexUnifiedCubeFloatFloatLevel,
+   TexUnifiedCubeS32Float,
+   TexUnifiedCubeS32FloatLevel,
+   TexUnifiedCubeU32Float,
+   TexUnifiedCubeU32FloatLevel,
+   TexUnifiedCubeArrayFloatFloat,
+   TexUnifiedCubeArrayFloatFloatLevel,
+   TexUnifiedCubeArrayS32Float,
+   TexUnifiedCubeArrayS32FloatLevel,
+   TexUnifiedCubeArrayU32Float,
+   TexUnifiedCubeArrayU32FloatLevel,
+   Tld4UnifiedR2DFloatFloat,
+   Tld4UnifiedG2DFloatFloat,
+   Tld4UnifiedB2DFloatFloat,
+   Tld4UnifiedA2DFloatFloat,
+   Tld4UnifiedR2DS64Float,
+   Tld4UnifiedG2DS64Float,
+   Tld4UnifiedB2DS64Float,
+   Tld4UnifiedA2DS64Float,
+   Tld4UnifiedR2DU64Float,
+   Tld4UnifiedG2DU64Float,
+   Tld4UnifiedB2DU64Float,
+   Tld4UnifiedA2DU64Float,
+ 
+   // Surface intrinsics
+   Suld1DI8Clamp,
+   Suld1DI16Clamp,
+   Suld1DI32Clamp,
+   Suld1DI64Clamp,
+   Suld1DV2I8Clamp,
+   Suld1DV2I16Clamp,
+   Suld1DV2I32Clamp,
+   Suld1DV2I64Clamp,
+   Suld1DV4I8Clamp,
+   Suld1DV4I16Clamp,
+   Suld1DV4I32Clamp,
+ 
+   Suld1DArrayI8Clamp,
+   Suld1DArrayI16Clamp,
+   Suld1DArrayI32Clamp,
+   Suld1DArrayI64Clamp,
+   Suld1DArrayV2I8Clamp,
+   Suld1DArrayV2I16Clamp,
+   Suld1DArrayV2I32Clamp,
+   Suld1DArrayV2I64Clamp,
+   Suld1DArrayV4I8Clamp,
+   Suld1DArrayV4I16Clamp,
+   Suld1DArrayV4I32Clamp,
+ 
+   Suld2DI8Clamp,
+   Suld2DI16Clamp,
+   Suld2DI32Clamp,
+   Suld2DI64Clamp,
+   Suld2DV2I8Clamp,
+   Suld2DV2I16Clamp,
+   Suld2DV2I32Clamp,
+   Suld2DV2I64Clamp,
+   Suld2DV4I8Clamp,
+   Suld2DV4I16Clamp,
+   Suld2DV4I32Clamp,
+ 
+   Suld2DArrayI8Clamp,
+   Suld2DArrayI16Clamp,
+   Suld2DArrayI32Clamp,
+   Suld2DArrayI64Clamp,
+   Suld2DArrayV2I8Clamp,
+   Suld2DArrayV2I16Clamp,
+   Suld2DArrayV2I32Clamp,
+   Suld2DArrayV2I64Clamp,
+   Suld2DArrayV4I8Clamp,
+   Suld2DArrayV4I16Clamp,
+   Suld2DArrayV4I32Clamp,
+ 
+   Suld3DI8Clamp,
+   Suld3DI16Clamp,
+   Suld3DI32Clamp,
+   Suld3DI64Clamp,
+   Suld3DV2I8Clamp,
+   Suld3DV2I16Clamp,
+   Suld3DV2I32Clamp,
+   Suld3DV2I64Clamp,
+   Suld3DV4I8Clamp,
+   Suld3DV4I16Clamp,
+   Suld3DV4I32Clamp,
+ 
+   Suld1DI8Trap,
+   Suld1DI16Trap,
+   Suld1DI32Trap,
+   Suld1DI64Trap,
+   Suld1DV2I8Trap,
+   Suld1DV2I16Trap,
+   Suld1DV2I32Trap,
+   Suld1DV2I64Trap,
+   Suld1DV4I8Trap,
+   Suld1DV4I16Trap,
+   Suld1DV4I32Trap,
+ 
+   Suld1DArrayI8Trap,
+   Suld1DArrayI16Trap,
+   Suld1DArrayI32Trap,
+   Suld1DArrayI64Trap,
+   Suld1DArrayV2I8Trap,
+   Suld1DArrayV2I16Trap,
+   Suld1DArrayV2I32Trap,
+   Suld1DArrayV2I64Trap,
+   Suld1DArrayV4I8Trap,
+   Suld1DArrayV4I16Trap,
+   Suld1DArrayV4I32Trap,
+ 
+   Suld2DI8Trap,
+   Suld2DI16Trap,
+   Suld2DI32Trap,
+   Suld2DI64Trap,
+   Suld2DV2I8Trap,
+   Suld2DV2I16Trap,
+   Suld2DV2I32Trap,
+   Suld2DV2I64Trap,
+   Suld2DV4I8Trap,
+   Suld2DV4I16Trap,
+   Suld2DV4I32Trap,
+ 
+   Suld2DArrayI8Trap,
+   Suld2DArrayI16Trap,
+   Suld2DArrayI32Trap,
+   Suld2DArrayI64Trap,
+   Suld2DArrayV2I8Trap,
+   Suld2DArrayV2I16Trap,
+   Suld2DArrayV2I32Trap,
+   Suld2DArrayV2I64Trap,
+   Suld2DArrayV4I8Trap,
+   Suld2DArrayV4I16Trap,
+   Suld2DArrayV4I32Trap,
+ 
+   Suld3DI8Trap,
+   Suld3DI16Trap,
+   Suld3DI32Trap,
+   Suld3DI64Trap,
+   Suld3DV2I8Trap,
+   Suld3DV2I16Trap,
+   Suld3DV2I32Trap,
+   Suld3DV2I64Trap,
+   Suld3DV4I8Trap,
+   Suld3DV4I16Trap,
+   Suld3DV4I32Trap,
+ 
+   Suld1DI8Zero,
+   Suld1DI16Zero,
+   Suld1DI32Zero,
+   Suld1DI64Zero,
+   Suld1DV2I8Zero,
+   Suld1DV2I16Zero,
+   Suld1DV2I32Zero,
+   Suld1DV2I64Zero,
+   Suld1DV4I8Zero,
+   Suld1DV4I16Zero,
+   Suld1DV4I32Zero,
+ 
+   Suld1DArrayI8Zero,
+   Suld1DArrayI16Zero,
+   Suld1DArrayI32Zero,
+   Suld1DArrayI64Zero,
+   Suld1DArrayV2I8Zero,
+   Suld1DArrayV2I16Zero,
+   Suld1DArrayV2I32Zero,
+   Suld1DArrayV2I64Zero,
+   Suld1DArrayV4I8Zero,
+   Suld1DArrayV4I16Zero,
+   Suld1DArrayV4I32Zero,
+ 
+   Suld2DI8Zero,
+   Suld2DI16Zero,
+   Suld2DI32Zero,
+   Suld2DI64Zero,
+   Suld2DV2I8Zero,
+   Suld2DV2I16Zero,
+   Suld2DV2I32Zero,
+   Suld2DV2I64Zero,
+   Suld2DV4I8Zero,
+   Suld2DV4I16Zero,
+   Suld2DV4I32Zero,
+ 
+   Suld2DArrayI8Zero,
+   Suld2DArrayI16Zero,
+   Suld2DArrayI32Zero,
+   Suld2DArrayI64Zero,
+   Suld2DArrayV2I8Zero,
+   Suld2DArrayV2I16Zero,
+   Suld2DArrayV2I32Zero,
+   Suld2DArrayV2I64Zero,
+   Suld2DArrayV4I8Zero,
+   Suld2DArrayV4I16Zero,
+   Suld2DArrayV4I32Zero,
+ 
+   Suld3DI8Zero,
+   Suld3DI16Zero,
+   Suld3DI32Zero,
+   Suld3DI64Zero,
+   Suld3DV2I8Zero,
+   Suld3DV2I16Zero,
+   Suld3DV2I32Zero,
+   Suld3DV2I64Zero,
+   Suld3DV4I8Zero,
+   Suld3DV4I16Zero,
+   Suld3DV4I32Zero
+ };
+ }
+ 
+ class NVPTXSubtarget;
+ 
+ //===--------------------------------------------------------------------===//
+ // TargetLowering Implementation
+ //===--------------------------------------------------------------------===//
+ class NVPTXTargetLowering : public TargetLowering {
+ public:
+   explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                                const NVPTXSubtarget &STI);
+   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ 
+   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ 
+   const char *getTargetNodeName(unsigned Opcode) const override;
+ 
+   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                           MachineFunction &MF,
+                           unsigned Intrinsic) const override;
+ 
+   /// isLegalAddressingMode - Return true if the addressing mode represented
+   /// by AM is legal for this target, for a load/store of the specified type
+   /// Used to guide target specific optimizations, like loop strength
+   /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+   /// address mode (CodeGenPrepare.cpp)
+   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                              unsigned AS,
+                              Instruction *I = nullptr) const override;
+ 
+   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
+     // Truncating 64-bit to 32-bit is free in SASS.
+     if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+       return false;
+     return SrcTy->getPrimitiveSizeInBits() == 64 &&
+            DstTy->getPrimitiveSizeInBits() == 32;
+   }
+ 
+   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                          EVT VT) const override {
+     if (VT.isVector())
+       return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+     return MVT::i1;
+   }
+ 
+   ConstraintType getConstraintType(StringRef Constraint) const override;
+   std::pair<unsigned, const TargetRegisterClass *>
+   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                StringRef Constraint, MVT VT) const override;
+ 
+   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                                bool isVarArg,
+                                const SmallVectorImpl<ISD::InputArg> &Ins,
+                                const SDLoc &dl, SelectionDAG &DAG,
+                                SmallVectorImpl<SDValue> &InVals) const override;
+ 
+   SDValue LowerCall(CallLoweringInfo &CLI,
+                     SmallVectorImpl<SDValue> &InVals) const override;
+ 
+   std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
+                            const SmallVectorImpl<ISD::OutputArg> &,
+                            unsigned retAlignment,
+                            ImmutableCallSite CS) const;
+ 
+   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                       SelectionDAG &DAG) const override;
+ 
+   void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                     std::vector<SDValue> &Ops,
+                                     SelectionDAG &DAG) const override;
+ 
+   const NVPTXTargetMachine *nvTM;
+ 
+   // PTX always uses 32-bit shift amounts
+   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+     return MVT::i32;
+   }
+ 
+   TargetLoweringBase::LegalizeTypeAction
+   getPreferredVectorAction(MVT VT) const override;
+ 
+   // Get the degree of precision we want from 32-bit floating point division
+   // operations.
+   //
+   //  0 - Use ptx div.approx
+   //  1 - Use ptx.div.full (approximate, but less so than div.approx)
+   //  2 - Use IEEE-compliant div instructions, if available.
+   int getDivF32Level() const;
+ 
+   // Get whether we should use a precise or approximate 32-bit floating point
+   // sqrt instruction.
+   bool usePrecSqrtF32() const;
+ 
+   // Get whether we should use instructions that flush floating-point denormals
+   // to sign-preserving zero.
+   bool useF32FTZ(const MachineFunction &MF) const;
+ 
+   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &ExtraSteps, bool &UseOneConst,
+                           bool Reciprocal) const override;
+ 
+   unsigned combineRepeatedFPDivisors() const override { return 2; }
+ 
+   bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+   bool allowUnsafeFPMath(MachineFunction &MF) const;
+ 
+   bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
+ 
+   bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+ 
+   // The default is to transform llvm.ctlz(x, false) (where false indicates that
+   // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+   // and avoids calling ctlz in that case.  We have a dedicated ctlz
+   // instruction, so we say that ctlz is cheap to speculate.
+   bool isCheapToSpeculateCtlz() const override { return true; }
+ 
+ private:
+   const NVPTXSubtarget &STI; // cache the subtarget here
+   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+ 
+   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ 
+   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+ 
+   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+ 
+   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+ 
+   SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;
+ 
+   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                           SelectionDAG &DAG) const override;
+   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ 
+   unsigned getArgumentAlignment(SDValue Callee, ImmutableCallSite CS, Type *Ty,
+                                 unsigned Idx, const DataLayout &DL) const;
+ };
+ } // namespace llvm
+ 
+ #endif
+diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
+index 48db941db9b..c73e65c6efe 100644
+--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
++++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
+@@ -1,3122 +1,3141 @@
+ //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // This file describes the PTX instructions in TableGen format.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ include "NVPTXInstrFormats.td"
+ 
+ // A NOP instruction
+ let hasSideEffects = 0 in {
+   def NOP : NVPTXInst<(outs), (ins), "", []>;
+ }
+ 
+ let OperandType = "OPERAND_IMMEDIATE" in {
+   def f16imm : Operand<f16>;
+ }
+ 
+ // List of vector specific properties
+ def isVecLD      : VecInstTypeEnum<1>;
+ def isVecST      : VecInstTypeEnum<2>;
+ def isVecBuild   : VecInstTypeEnum<3>;
+ def isVecShuffle : VecInstTypeEnum<4>;
+ def isVecExtract : VecInstTypeEnum<5>;
+ def isVecInsert  : VecInstTypeEnum<6>;
+ def isVecDest    : VecInstTypeEnum<7>;
+ def isVecOther   : VecInstTypeEnum<15>;
+ 
+ //===----------------------------------------------------------------------===//
+ // NVPTX Operand Definitions.
+ //===----------------------------------------------------------------------===//
+ 
+ def brtarget    : Operand<OtherVT>;
+ 
+ // CVT conversion modes
+ // These must match the enum in NVPTX.h
+ def CvtNONE : PatLeaf<(i32 0x0)>;
+ def CvtRNI  : PatLeaf<(i32 0x1)>;
+ def CvtRZI  : PatLeaf<(i32 0x2)>;
+ def CvtRMI  : PatLeaf<(i32 0x3)>;
+ def CvtRPI  : PatLeaf<(i32 0x4)>;
+ def CvtRN   : PatLeaf<(i32 0x5)>;
+ def CvtRZ   : PatLeaf<(i32 0x6)>;
+ def CvtRM   : PatLeaf<(i32 0x7)>;
+ def CvtRP   : PatLeaf<(i32 0x8)>;
+ 
+ def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+ def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
+ def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
+ def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
+ def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
+ def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
+ def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
+ def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
+ def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
+ 
+ def CvtSAT      : PatLeaf<(i32 0x20)>;
+ def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
+ 
+ def CvtMode : Operand<i32> {
+   let PrintMethod = "printCvtMode";
+ }
+ 
+ // Compare modes
+ // These must match the enum in NVPTX.h
+ def CmpEQ   : PatLeaf<(i32 0)>;
+ def CmpNE   : PatLeaf<(i32 1)>;
+ def CmpLT   : PatLeaf<(i32 2)>;
+ def CmpLE   : PatLeaf<(i32 3)>;
+ def CmpGT   : PatLeaf<(i32 4)>;
+ def CmpGE   : PatLeaf<(i32 5)>;
+ def CmpEQU  : PatLeaf<(i32 10)>;
+ def CmpNEU  : PatLeaf<(i32 11)>;
+ def CmpLTU  : PatLeaf<(i32 12)>;
+ def CmpLEU  : PatLeaf<(i32 13)>;
+ def CmpGTU  : PatLeaf<(i32 14)>;
+ def CmpGEU  : PatLeaf<(i32 15)>;
+ def CmpNUM  : PatLeaf<(i32 16)>;
+ def CmpNAN  : PatLeaf<(i32 17)>;
+ 
+ def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
+ def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
+ def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
+ def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
+ def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
+ def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
+ def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
+ def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
+ def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
+ def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
+ def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
+ def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
+ def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
+ def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
+ 
+ def CmpMode : Operand<i32> {
+   let PrintMethod = "printCmpMode";
+ }
+ def VecElement : Operand<i32> {
+   let PrintMethod = "printVecElement";
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ // NVPTX Instruction Predicate Definitions
+ //===----------------------------------------------------------------------===//
+ 
+ 
+ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+ def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+ def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+ def hasVote : Predicate<"Subtarget->hasVote()">;
+ def hasDouble : Predicate<"Subtarget->hasDouble()">;
+ def hasLDG : Predicate<"Subtarget->hasLDG()">;
+ def hasLDU : Predicate<"Subtarget->hasLDU()">;
+ 
+ def doF32FTZ : Predicate<"useF32FTZ()">;
+ def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+ 
+ def doMulWide      : Predicate<"doMulWide">;
+ 
+ def allowFMA : Predicate<"allowFMA()">;
+ def noFMA : Predicate<"!allowFMA()">;
+ def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
+ 
+ def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+ def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+ 
+ def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+ 
+ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+ 
+ def true : Predicate<"true">;
+ 
+ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+ def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
+ def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
+ 
+ def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
+ def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
+ 
+ def useShortPtr : Predicate<"useShortPointers()">;
+ def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
+ 
+ //===----------------------------------------------------------------------===//
+ // Some Common Instruction Class Templates
+ //===----------------------------------------------------------------------===//
+ 
+ // Template for instructions which take three int64, int32, or int16 args.
+ // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
+ multiclass I3<string OpcStr, SDNode OpNode> {
+   def i64rr :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+   def i64ri :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+   def i32rr :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+   def i32ri :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+   def i16rr :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+   def i16ri :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+ }
+ 
+ // Template for instructions which take 3 int32 args.  The instructions are
+ // named "<OpcStr>.s32" (e.g. "addc.cc.s32").
+ multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+    def i32rr :
+      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+    def i32ri :
+      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ }
+ 
+ // Template for instructions which take three fp64 or fp32 args.  The
+ // instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+ //
+ // Also defines ftz (flush subnormal inputs and results to sign-preserving
+ // zero) variants for fp32 functions.
+ //
+ // This multiclass should be used for nodes that cannot be folded into FMAs.
+ // For nodes that can be folded into FMAs (i.e. adds and muls), use
+ // F3_fma_component.
+ multiclass F3<string OpcStr, SDNode OpNode> {
+    def f64rr :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, Float64Regs:$b),
+                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+    def f64ri :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, f64imm:$b),
+                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+    def f32rr_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[doF32FTZ]>;
+    def f32ri_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[doF32FTZ]>;
+    def f32rr :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+    def f32ri :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+ }
+ 
+ // Template for instructions which take three FP args.  The
+ // instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+ //
+ // Also defines ftz (flush subnormal inputs and results to sign-preserving
+ // zero) variants for fp32/fp16 functions.
+ //
+ // This multiclass should be used for nodes that can be folded to make fma ops.
+ // In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+ // just like the non ".rn" op, but prevents ptxas from creating FMAs.
+ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+    def f64rr :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, Float64Regs:$b),
+                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                Requires<[allowFMA]>;
+    def f64ri :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, f64imm:$b),
+                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+                Requires<[allowFMA]>;
+    def f32rr_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[allowFMA, doF32FTZ]>;
+    def f32ri_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[allowFMA, doF32FTZ]>;
+    def f32rr :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[allowFMA]>;
+    def f32ri :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[allowFMA]>;
+ 
+    def f16rr_ftz :
+      NVPTXInst<(outs Float16Regs:$dst),
+                (ins Float16Regs:$a, Float16Regs:$b),
+                !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+    def f16rr :
+      NVPTXInst<(outs Float16Regs:$dst),
+                (ins Float16Regs:$a, Float16Regs:$b),
+                !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+                Requires<[useFP16Math, allowFMA]>;
+ 
+    def f16x2rr_ftz :
+      NVPTXInst<(outs Float16x2Regs:$dst),
+                (ins Float16x2Regs:$a, Float16x2Regs:$b),
+                !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+                Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+    def f16x2rr :
+      NVPTXInst<(outs Float16x2Regs:$dst),
+                (ins Float16x2Regs:$a, Float16x2Regs:$b),
+                !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+                Requires<[useFP16Math, allowFMA]>;
+ 
+    // These have strange names so we don't perturb existing mir tests.
+    def _rnf64rr :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, Float64Regs:$b),
+                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                Requires<[noFMA]>;
+    def _rnf64ri :
+      NVPTXInst<(outs Float64Regs:$dst),
+                (ins Float64Regs:$a, f64imm:$b),
+                !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+                Requires<[noFMA]>;
+    def _rnf32rr_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[noFMA, doF32FTZ]>;
+    def _rnf32ri_ftz :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[noFMA, doF32FTZ]>;
+    def _rnf32rr :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, Float32Regs:$b),
+                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                Requires<[noFMA]>;
+    def _rnf32ri :
+      NVPTXInst<(outs Float32Regs:$dst),
+                (ins Float32Regs:$a, f32imm:$b),
+                !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+                Requires<[noFMA]>;
+    def _rnf16rr_ftz :
+      NVPTXInst<(outs Float16Regs:$dst),
+                (ins Float16Regs:$a, Float16Regs:$b),
+                !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
+                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+                Requires<[useFP16Math, noFMA, doF32FTZ]>;
+    def _rnf16rr :
+      NVPTXInst<(outs Float16Regs:$dst),
+                (ins Float16Regs:$a, Float16Regs:$b),
+                !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
+                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+                Requires<[useFP16Math, noFMA]>;
+    def _rnf16x2rr_ftz :
+      NVPTXInst<(outs Float16x2Regs:$dst),
+                (ins Float16x2Regs:$a, Float16x2Regs:$b),
+                !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
+                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+                Requires<[useFP16Math, noFMA, doF32FTZ]>;
+    def _rnf16x2rr :
+      NVPTXInst<(outs Float16x2Regs:$dst),
+                (ins Float16x2Regs:$a, Float16x2Regs:$b),
+                !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
+                [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+                Requires<[useFP16Math, noFMA]>;
+ }
+ 
+ // Template for operations which take two f32 or f64 operands.  Provides three
+ // instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+ // subnormal inputs and results to zero).
+ multiclass F2<string OpcStr, SDNode OpNode> {
+    def f64 :     NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+                            !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+                            [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+    def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                            !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+                            [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+                            Requires<[doF32FTZ]>;
+    def f32 :     NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                            !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+                            [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+ }
+ 
+ //===----------------------------------------------------------------------===//
+ // NVPTX Instructions.
+ //===----------------------------------------------------------------------===//
+ 
+ //-----------------------------------
+ // Type Conversion
+ //-----------------------------------
+ 
+ let hasSideEffects = 0 in {
+   // Generate a cvt to the given type from all possible types.  Each instance
+   // takes a CvtMode immediate that defines the conversion mode to use.  It can
+   // be CvtNONE to omit a conversion mode.
+   multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+     def _s8 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int16Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".s8 \t$dst, $src;"), []>;
+     def _u8 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int16Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".u8 \t$dst, $src;"), []>;
+     def _s16 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int16Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".s16 \t$dst, $src;"), []>;
+     def _u16 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int16Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".u16 \t$dst, $src;"), []>;
+     def _s32 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int32Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".s32 \t$dst, $src;"), []>;
+     def _u32 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int32Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".u32 \t$dst, $src;"), []>;
+     def _s64 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int64Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".s64 \t$dst, $src;"), []>;
+     def _u64 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Int64Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".u64 \t$dst, $src;"), []>;
+     def _f16 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Float16Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".f16 \t$dst, $src;"), []>;
+     def _f32 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Float32Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".f32 \t$dst, $src;"), []>;
+     def _f64 :
+       NVPTXInst<(outs RC:$dst),
+                 (ins Float64Regs:$src, CvtMode:$mode),
+                 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                 FromName, ".f64 \t$dst, $src;"), []>;
+   }
+ 
+   // Generate cvts from all types to all types.
+   defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
+   defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
+   defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+   defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+   defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+   defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+   defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+   defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+   defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
+   defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+   defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+ 
+   // These cvts are different from those above: The source and dest registers
+   // are of the same type.
+   def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                                     "cvt.s16.s8 \t$dst, $src;", []>;
+   def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                     "cvt.s32.s8 \t$dst, $src;", []>;
+   def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                     "cvt.s32.s16 \t$dst, $src;", []>;
+   def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                     "cvt.s64.s8 \t$dst, $src;", []>;
+   def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                     "cvt.s64.s16 \t$dst, $src;", []>;
+   def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                     "cvt.s64.s32 \t$dst, $src;", []>;
+ }
+ 
+ //-----------------------------------
+ // Integer Arithmetic
+ //-----------------------------------
+ 
+ // Template for xor masquerading as int1 arithmetic.
+ multiclass ADD_SUB_i1<SDNode OpNode> {
+    def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+                       "xor.pred \t$dst, $a, $b;",
+                       [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+    def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+                       "xor.pred \t$dst, $a, $b;",
+                       [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+ }
+ 
+ // int1 addition and subtraction are both just xor.
+ defm ADD_i1 : ADD_SUB_i1<add>;
+ defm SUB_i1 : ADD_SUB_i1<sub>;
+ 
+ // int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
+ // also use these for unsigned arithmetic.
+ defm ADD : I3<"add.s", add>;
+ defm SUB : I3<"sub.s", sub>;
+ 
+ // int32 addition and subtraction with carry-out.
+ // FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
+ defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+ defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+ 
+ // int32 addition and subtraction with carry-in and carry-out.
+ defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+ defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+ 
+ defm MULT : I3<"mul.lo.s", mul>;
+ 
+ defm MULTHS : I3<"mul.hi.s", mulhs>;
+ defm MULTHU : I3<"mul.hi.u", mulhu>;
+ 
+ defm SDIV : I3<"div.s", sdiv>;
+ defm UDIV : I3<"div.u", udiv>;
+ 
+ // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+ // will lower it.
+ defm SREM : I3<"rem.s", srem>;
+ defm UREM : I3<"rem.u", urem>;
+ 
+ // Integer absolute value.  NumBits should be one minus the bit width of RC.
+ // This idiom implements the algorithm at
+ // http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
+ multiclass ABS<RegisterClass RC, string SizeName> {
+   def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
+                   !strconcat("abs", SizeName, " \t$dst, $a;"),
+                   [(set RC:$dst, (abs RC:$a))]>;
+ }
+ defm ABS_16 : ABS<Int16Regs, ".s16">;
+ defm ABS_32 : ABS<Int32Regs, ".s32">;
+ defm ABS_64 : ABS<Int64Regs, ".s64">;
+ 
+ // Integer min/max.
+ defm SMAX : I3<"max.s", smax>;
+ defm UMAX : I3<"max.u", umax>;
+ defm SMIN : I3<"min.s", smin>;
+ defm UMIN : I3<"min.u", umin>;
+ 
+ //
+ // Wide multiplication
+ //
+ def MULWIDES64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+             "mul.wide.s32 \t$dst, $a, $b;", []>;
+ def MULWIDES64Imm :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+             "mul.wide.s32 \t$dst, $a, $b;", []>;
+ def MULWIDES64Imm64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+             "mul.wide.s32 \t$dst, $a, $b;", []>;
+ 
+ def MULWIDEU64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+             "mul.wide.u32 \t$dst, $a, $b;", []>;
+ def MULWIDEU64Imm :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+             "mul.wide.u32 \t$dst, $a, $b;", []>;
+ def MULWIDEU64Imm64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+             "mul.wide.u32 \t$dst, $a, $b;", []>;
+ 
+ def MULWIDES32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+             "mul.wide.s16 \t$dst, $a, $b;", []>;
+ def MULWIDES32Imm :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+             "mul.wide.s16 \t$dst, $a, $b;", []>;
+ def MULWIDES32Imm32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+             "mul.wide.s16 \t$dst, $a, $b;", []>;
+ 
+ def MULWIDEU32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+             "mul.wide.u16 \t$dst, $a, $b;", []>;
+ def MULWIDEU32Imm :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+             "mul.wide.u16 \t$dst, $a, $b;", []>;
+ def MULWIDEU32Imm32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+             "mul.wide.u16 \t$dst, $a, $b;", []>;
+ 
+ def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+ def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+ 
+ // Matchers for signed, unsigned mul.wide ISD nodes.
+ def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+           (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+           (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+           (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+           (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+       Requires<[doMulWide]>;
+ 
+ def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+           (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+           (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+           (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+       Requires<[doMulWide]>;
+ 
+ // Predicates used for converting some patterns to mul.wide.
+ def SInt32Const : PatLeaf<(imm), [{
+   // Matches immediates representable as a signed 32-bit integer.
+   return N->getAPIntValue().isSignedIntN(32);
+ }]>;
+ 
+ def UInt32Const : PatLeaf<(imm), [{
+   // Matches immediates representable as an unsigned 32-bit integer.
+   return N->getAPIntValue().isIntN(32);
+ }]>;
+ 
+ def SInt16Const : PatLeaf<(imm), [{
+   // Matches immediates representable as a signed 16-bit integer.
+   return N->getAPIntValue().isSignedIntN(16);
+ }]>;
+ 
+ def UInt16Const : PatLeaf<(imm), [{
+   // Matches immediates representable as an unsigned 16-bit integer.
+   return N->getAPIntValue().isIntN(16);
+ }]>;
+ 
+ def Int5Const : PatLeaf<(imm), [{
+   // Check if 0 <= v < 31; only then is (1 << v) a positive int32 multiplier.
+   const APInt &v = N->getAPIntValue();
+   return v.sge(0) && v.slt(31);
+ }]>;
+ 
+ def Int4Const : PatLeaf<(imm), [{
+   // Check if 0 <= v < 15; only then is (1 << v) a positive int16 multiplier.
+   const APInt &v = N->getAPIntValue();
+   return v.sge(0) && v.slt(15);
+ }]>;
+ 
+ def SHL2MUL32 : SDNodeXForm<imm, [{
+   const APInt &ShAmt = N->getAPIntValue();
+   const APInt One(32, 1);
+   return CurDAG->getTargetConstant(One.shl(ShAmt), SDLoc(N), MVT::i32);
+ }]>;
+ 
+ def SHL2MUL16 : SDNodeXForm<imm, [{
+   const APInt &ShAmt = N->getAPIntValue();
+   const APInt One(16, 1);
+   return CurDAG->getTargetConstant(One.shl(ShAmt), SDLoc(N), MVT::i16);
+ }]>;
+ 
+ // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+ def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+           (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+       Requires<[doMulWide]>;
+ def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+           (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+       Requires<[doMulWide]>;
+ 
+ def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+           (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+       Requires<[doMulWide]>;
+ def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+           (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+       Requires<[doMulWide]>;
+ 
+ // Convert "sign/zero-extend then multiply" to mul.wide.
+ def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+           (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+           (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+       Requires<[doMulWide]>;
+ 
+ def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+           (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+           (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+       Requires<[doMulWide]>;
+ 
+ def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+           (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+           (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+       Requires<[doMulWide]>;
+ 
+ def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+           (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+       Requires<[doMulWide]>;
+ def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+           (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+       Requires<[doMulWide]>;
+ 
+ //
+ // Integer multiply-add
+ //
+ def SDTIMAD :
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+                        SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+ def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+ 
+ def MAD16rrr :
+   NVPTXInst<(outs Int16Regs:$dst),
+             (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+             "mad.lo.s16 \t$dst, $a, $b, $c;",
+             [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+ def MAD16rri :
+   NVPTXInst<(outs Int16Regs:$dst),
+             (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+             "mad.lo.s16 \t$dst, $a, $b, $c;",
+             [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+ def MAD16rir :
+   NVPTXInst<(outs Int16Regs:$dst),
+             (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+             "mad.lo.s16 \t$dst, $a, $b, $c;",
+             [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+ def MAD16rii :
+   NVPTXInst<(outs Int16Regs:$dst),
+             (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+             "mad.lo.s16 \t$dst, $a, $b, $c;",
+             [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+ 
+ def MAD32rrr :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+             "mad.lo.s32 \t$dst, $a, $b, $c;",
+             [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+ def MAD32rri :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+             "mad.lo.s32 \t$dst, $a, $b, $c;",
+             [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+ def MAD32rir :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+             "mad.lo.s32 \t$dst, $a, $b, $c;",
+             [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+ def MAD32rii :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+             "mad.lo.s32 \t$dst, $a, $b, $c;",
+             [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+ 
+ def MAD64rrr :
+   NVPTXInst<(outs Int64Regs:$dst),
+             (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+             "mad.lo.s64 \t$dst, $a, $b, $c;",
+             [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+ def MAD64rri :
+   NVPTXInst<(outs Int64Regs:$dst),
+             (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+             "mad.lo.s64 \t$dst, $a, $b, $c;",
+             [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+ def MAD64rir :
+   NVPTXInst<(outs Int64Regs:$dst),
+             (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+             "mad.lo.s64 \t$dst, $a, $b, $c;",
+             [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+ def MAD64rii :
+   NVPTXInst<(outs Int64Regs:$dst),
+             (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+             "mad.lo.s64 \t$dst, $a, $b, $c;",
+             [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+ 
+ def INEG16 :
+   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+             "neg.s16 \t$dst, $src;",
+             [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+ def INEG32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+             "neg.s32 \t$dst, $src;",
+             [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+ def INEG64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+             "neg.s64 \t$dst, $src;",
+             [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+ 
+ //-----------------------------------
+ // Floating Point Arithmetic
+ //-----------------------------------
+ 
+ // Constant 1.0f
+ def FloatConst1 : PatLeaf<(fpimm), [{
+   return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
+          N->getValueAPF().convertToFloat() == 1.0f;
+ }]>;
+ // Constant 1.0 (double)
+ def DoubleConst1 : PatLeaf<(fpimm), [{
+   return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
+          N->getValueAPF().convertToDouble() == 1.0;
+ }]>;
+ 
+ // Loads FP16 constant into a register.
+ //
+ // ptxas does not have hex representation for fp16, so we can't use
+ // fp16 immediate values in .f16 instructions. Instead we have to load
+ // the constant into a register using mov.b16.
+ def LOAD_CONST_F16 :
+   NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
+             "mov.b16 \t$dst, $a;", []>;
+ 
+ defm FADD : F3_fma_component<"add", fadd>;
+ defm FSUB : F3_fma_component<"sub", fsub>;
+ defm FMUL : F3_fma_component<"mul", fmul>;
+ 
+ defm FMIN : F3<"min", fminnum>;
+ defm FMAX : F3<"max", fmaxnum>;
+ 
+ defm FABS  : F2<"abs", fabs>;
+ defm FNEG  : F2<"neg", fneg>;
+ defm FSQRT : F2<"sqrt.rn", fsqrt>;
+ 
+ //
+ // F64 division
+ //
+ def FDIV641r :
+   NVPTXInst<(outs Float64Regs:$dst),
+             (ins f64imm:$a, Float64Regs:$b),
+             "rcp.rn.f64 \t$dst, $b;",
+             [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+ def FDIV64rr :
+   NVPTXInst<(outs Float64Regs:$dst),
+             (ins Float64Regs:$a, Float64Regs:$b),
+             "div.rn.f64 \t$dst, $a, $b;",
+             [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+ def FDIV64ri :
+   NVPTXInst<(outs Float64Regs:$dst),
+             (ins Float64Regs:$a, f64imm:$b),
+             "div.rn.f64 \t$dst, $a, $b;",
+             [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+ 
+ //
+ // F32 Approximate reciprocal
+ //
+ def FDIV321r_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.approx.ftz.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+ def FDIV321r :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.approx.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_APPROX]>;
+ //
+ // F32 Approximate division
+ //
+ def FDIV32approxrr_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.approx.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+ def FDIV32approxri_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.approx.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+             Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+ def FDIV32approxrr :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.approx.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_APPROX]>;
+ def FDIV32approxri :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.approx.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+             Requires<[do_DIVF32_APPROX]>;
+ //
+ // F32 Semi-accurate reciprocal
+ //
+ // rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+ //
+ def FDIV321r_approx_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.approx.ftz.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_FULL, doF32FTZ]>;
+ def FDIV321r_approx :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.approx.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_FULL]>;
+ //
+ // F32 Semi-accurate division
+ //
+ def FDIV32rr_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.full.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_FULL, doF32FTZ]>;
+ def FDIV32ri_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.full.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+             Requires<[do_DIVF32_FULL, doF32FTZ]>;
+ def FDIV32rr :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.full.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+             Requires<[do_DIVF32_FULL]>;
+ def FDIV32ri :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.full.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+             Requires<[do_DIVF32_FULL]>;
+ //
+ // F32 Accurate reciprocal
+ //
+ def FDIV321r_prec_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.rn.ftz.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+             Requires<[doF32FTZ]>;
+ def FDIV321r_prec :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins f32imm:$a, Float32Regs:$b),
+             "rcp.rn.f32 \t$dst, $b;",
+             [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>;
+ //
+ // F32 Accurate division
+ //
+ def FDIV32rr_prec_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.rn.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+             Requires<[doF32FTZ]>;
+ def FDIV32ri_prec_ftz :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.rn.ftz.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+             Requires<[doF32FTZ]>;
+ def FDIV32rr_prec :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, Float32Regs:$b),
+             "div.rn.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>;
+ def FDIV32ri_prec :
+   NVPTXInst<(outs Float32Regs:$dst),
+             (ins Float32Regs:$a, f32imm:$b),
+             "div.rn.f32 \t$dst, $a, $b;",
+             [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>;
+ 
+ //
+ // FMA
+ //
+ 
+ multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                        Requires<[Pred]>;
+    def rri : NVPTXInst<(outs RC:$dst),
+                        (ins RC:$a, RC:$b, ImmCls:$c),
+                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                        [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+                        Requires<[Pred]>;
+    def rir : NVPTXInst<(outs RC:$dst),
+                        (ins RC:$a, ImmCls:$b, RC:$c),
+                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                        [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+                        Requires<[Pred]>;
+    def rii : NVPTXInst<(outs RC:$dst),
+                        (ins RC:$a, ImmCls:$b, ImmCls:$c),
+                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                        [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+                        Requires<[Pred]>;
+ }
+ 
+ multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
+    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                        Requires<[useFP16Math, Pred]>;
+ }
+ 
+ defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
+ defm FMA16     : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+ defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+ defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
+ defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+ defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+ defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+ 
+ // sin/cos
+ def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                       "sin.approx.f32 \t$dst, $src;",
+                       [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
+                       Requires<[allowUnsafeFPMath]>;
+ def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                       "cos.approx.f32 \t$dst, $src;",
+                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
+                       Requires<[allowUnsafeFPMath]>;
+ 
+ // Lower (frem x, y) into (sub x, (mul (trunc (div x, y)) y)), i.e. "poor man's
+ // fmod()".  Truncation (CvtRZI) keeps the sign of x in the result.
+ 
+ // frem - f32 FTZ
+ def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+           (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+             (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRZI_FTZ),
+              Float32Regs:$y))>,
+           Requires<[doF32FTZ]>;
+ def : Pat<(frem Float32Regs:$x, fpimm:$y),
+           (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+             (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRZI_FTZ),
+              fpimm:$y))>,
+           Requires<[doF32FTZ]>;
+ 
+ // frem - f32
+ def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+           (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+             (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRZI),
+              Float32Regs:$y))>;
+ def : Pat<(frem Float32Regs:$x, fpimm:$y),
+           (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+             (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRZI),
+              fpimm:$y))>;
+ 
+ // frem - f64
+ def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+           (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+             (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRZI),
+              Float64Regs:$y))>;
+ def : Pat<(frem Float64Regs:$x, fpimm:$y),
+           (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+             (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRZI),
+              fpimm:$y))>;
+ 
+ //-----------------------------------
+ // Bitwise operations
+ //-----------------------------------
+ 
+ // Template for three-operand bitwise operations (dst, a, b).  Creates .b16,
+ // .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
+ multiclass BITWISE<string OpcStr, SDNode OpNode> {
+   def b1rr :
+     NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+               !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+   def b1ri :
+     NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+               !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+   def b16rr :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+               !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+   def b16ri :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+               !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+   def b32rr :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+   def b32ri :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+   def b64rr :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+               !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+   def b64ri :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+               !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ }
+ 
+ defm OR  : BITWISE<"or", or>;
+ defm AND : BITWISE<"and", and>;
+ defm XOR : BITWISE<"xor", xor>;
+ 
+ def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+                       "not.pred \t$dst, $src;",
+                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+ def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                       "not.b16 \t$dst, $src;",
+                       [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+ def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                       "not.b32 \t$dst, $src;",
+                       [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+ def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "not.b64 \t$dst, $src;",
+                        [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+ 
+ // Template for left/right shifts.  Takes three operands,
+ //   [dest (reg), src (reg), shift (reg or imm)].
+ // dest and src may be int64, int32, or int16, but shift is always int32.
+ //
+ // This template also defines a 32-bit shift (imm, imm) instruction.
+ multiclass SHIFT<string OpcStr, SDNode OpNode> {
+    def i64rr :
+      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+    def i64ri :
+      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+    def i32rr :
+      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+    def i32ri :
+      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+    def i32ii :
+      NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+    def i16rr :
+      NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+    def i16ri :
+      NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
+ }
+ 
+ defm SHL : SHIFT<"shl.b", shl>;
+ defm SRA : SHIFT<"shr.s", sra>;
+ defm SRL : SHIFT<"shr.u", srl>;
+ 
+ // Bit-reverse
+ def BREV32 :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+              "brev.b32 \t$dst, $a;",
+              [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
+ def BREV64 :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
+              "brev.b64 \t$dst, $a;",
+              [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
+ 
+ //
+ // Rotate: Use ptx shf instruction if available.
+ //
+ 
+ // 32 bit r2 = rotl r1, n
+ //    =>
+ //        r2 = shf.l r1, r1, n
+ def ROTL32imm_hw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+             [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+            Requires<[hasHWROT32]>;
+ 
+ def ROTL32reg_hw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+             "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+            Requires<[hasHWROT32]>;
+ 
+ // 32 bit r2 = rotr r1, n
+ //    =>
+ //        r2 = shf.r r1, r1, n
+ def ROTR32imm_hw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+             [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+            Requires<[hasHWROT32]>;
+ 
+ def ROTR32reg_hw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+             "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+            Requires<[hasHWROT32]>;
+ 
+ // 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
+ def ROT32imm_sw :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+             "{{\n\t"
+             ".reg .b32 %lhs;\n\t"
+             ".reg .b32 %rhs;\n\t"
+             "shl.b32 \t%lhs, $src, $amt1;\n\t"
+             "shr.b32 \t%rhs, $src, $amt2;\n\t"
+             "add.u32 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             []>;
+ 
+ def SUB_FRM_32 : SDNodeXForm<imm, [{
+   return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+ }]>;
+ 
+ def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+           (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+       Requires<[noHWROT32]>;
+ def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+           (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+       Requires<[noHWROT32]>;
+ 
+ // 32-bit software rotate left by register.
+ def ROTL32reg_sw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+             "{{\n\t"
+             ".reg .b32 %lhs;\n\t"
+             ".reg .b32 %rhs;\n\t"
+             ".reg .b32 %amt2;\n\t"
+             "shl.b32 \t%lhs, $src, $amt;\n\t"
+             "sub.s32 \t%amt2, 32, $amt;\n\t"
+             "shr.b32 \t%rhs, $src, %amt2;\n\t"
+             "add.u32 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+            Requires<[noHWROT32]>;
+ 
+ // 32-bit software rotate right by register.
+ def ROTR32reg_sw :
+   NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+             "{{\n\t"
+             ".reg .b32 %lhs;\n\t"
+             ".reg .b32 %rhs;\n\t"
+             ".reg .b32 %amt2;\n\t"
+             "shr.b32 \t%lhs, $src, $amt;\n\t"
+             "sub.s32 \t%amt2, 32, $amt;\n\t"
+             "shl.b32 \t%rhs, $src, %amt2;\n\t"
+             "add.u32 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+            Requires<[noHWROT32]>;
+ 
+ // 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
+ def ROT64imm_sw :
+   NVPTXInst<(outs Int64Regs:$dst),
+             (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+             "{{\n\t"
+             ".reg .b64 %lhs;\n\t"
+             ".reg .b64 %rhs;\n\t"
+             "shl.b64 \t%lhs, $src, $amt1;\n\t"
+             "shr.b64 \t%rhs, $src, $amt2;\n\t"
+             "add.u64 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             []>;
+ 
+ def SUB_FRM_64 : SDNodeXForm<imm, [{
+   return CurDAG->getTargetConstant(64 - N->getZExtValue(), SDLoc(N), MVT::i32);
+ }]>;
+ 
+ def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+           (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+ def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+           (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+ 
+ // 64-bit software rotate left by register.
+ def ROTL64reg_sw :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+             "{{\n\t"
+             ".reg .b64 %lhs;\n\t"
+             ".reg .b64 %rhs;\n\t"
+             ".reg .u32 %amt2;\n\t"
+             "shl.b64 \t%lhs, $src, $amt;\n\t"
+             "sub.u32 \t%amt2, 64, $amt;\n\t"
+             "shr.b64 \t%rhs, $src, %amt2;\n\t"
+             "add.u64 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+ 
+ def ROTR64reg_sw :
+   NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+             "{{\n\t"
+             ".reg .b64 %lhs;\n\t"
+             ".reg .b64 %rhs;\n\t"
+             ".reg .u32 %amt2;\n\t"
+             "shr.b64 \t%lhs, $src, $amt;\n\t"
+             "sub.u32 \t%amt2, 64, $amt;\n\t"
+             "shl.b64 \t%rhs, $src, %amt2;\n\t"
+             "add.u64 \t$dst, %lhs, %rhs;\n\t"
+             "}}",
+             [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+ 
+ //
+ // Funnel shift in clamp mode
+ //
+ 
+ // Create SDNodes so they can be used in the DAG code, e.g.
+ // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+ def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+ def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+ 
+ def FUNSHFLCLAMP :
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+             "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+             [(set Int32Regs:$dst,
+               (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+ 
+ def FUNSHFRCLAMP :  // Selects FUN_SHFR_CLAMP to shf.r.clamp.b32; all operands are 32-bit registers.
+   NVPTXInst<(outs Int32Regs:$dst),
+             (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+             "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+             [(set Int32Regs:$dst,
+              (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+ 
+ //
+ // BFE - bit-field extract
+ //
+ 
+ // Template for BFE instructions.  Takes four args,
+ //   [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
+ // Start may be an imm only if end is also an imm (variants rrr, rri, rii).
+ // FIXME: Is this a restriction in PTX?  NOTE(review): the PTX ISA appears to
+ // describe the last bfe operand as a field length, not an end -- confirm.
+ // dest and src may be int32 or int64, but start and end are always int32.
+ multiclass BFE<string TyStr, RegisterClass RC> {  // No selection patterns are attached to any variant.
+   def rrr
+     : NVPTXInst<(outs RC:$d),
+                 (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+   def rri
+     : NVPTXInst<(outs RC:$d),
+                 (ins RC:$a, Int32Regs:$b, i32imm:$c),
+                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+   def rii
+     : NVPTXInst<(outs RC:$d),
+                 (ins RC:$a, i32imm:$b, i32imm:$c),
+                 !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ }
+ 
+ let hasSideEffects = 0 in {  // Instantiate BFE for signed/unsigned 32- and 64-bit sources.
+   defm BFE_S32 : BFE<"s32", Int32Regs>;
+   defm BFE_U32 : BFE<"u32", Int32Regs>;
+   defm BFE_S64 : BFE<"s64", Int64Regs>;
+   defm BFE_U64 : BFE<"u64", Int64Regs>;
+ }
+ 
+ //-----------------------------------
+ // Comparison instructions (setp, set)
+ //-----------------------------------
+ 
+ // FIXME: This doesn't cover versions of set and setp that combine with a
+ // boolean predicate, e.g. setp.eq.and.b16.
+ 
+ let hasSideEffects = 0 in {
+   multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {  // setp forms writing an Int1Regs predicate; no patterns attached.
+     def rr :  // register, register
+       NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                            " \t$dst, $a, $b;"), []>;
+     def ri :  // register, immediate
+       NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                            " \t$dst, $a, $b;"), []>;
+     def ir :  // immediate, register
+       NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                 !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                            " \t$dst, $a, $b;"), []>;
+   }
+ }
+ 
+ defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;  // Each defm yields <name>rr, <name>ri and <name>ir.
+ defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+ defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+ defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+ defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+ defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+ defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+ defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+ defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+ defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+ defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+ def SETP_f16rr :  // f16 setp: register-register form only, gated on useFP16Math.
+       NVPTXInst<(outs Int1Regs:$dst),
+                 (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
+                 "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
+                 []>, Requires<[useFP16Math]>;
+ 
+ def SETP_f16x2rr :  // f16x2 setp: writes two predicates, printed as $p|$q.
+       NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+                 (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
+                 "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
+                 []>,
+                 Requires<[useFP16Math]>;
+ 
+ 
+ // FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
+ // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+ // reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
+ 
+ let hasSideEffects = 0 in {
+   multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {  // set forms (rr/ri/ir) writing an Int32Regs result; no patterns attached.
+     def rr : NVPTXInst<(outs Int32Regs:$dst),
+                        (ins RC:$a, RC:$b, CmpMode:$cmp),
+                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+     def ri : NVPTXInst<(outs Int32Regs:$dst),
+                        (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+     def ir : NVPTXInst<(outs Int32Regs:$dst),
+                        (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                        !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+   }
+ }
+ 
+ defm SET_b16 : SET<"b16", Int16Regs, i16imm>;  // Each defm yields <name>rr, <name>ri and <name>ir.
+ defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+ defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+ defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+ defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+ defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+ defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+ defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+ defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+ defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
+ defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+ defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+ 
+ //-----------------------------------
+ // Selection instructions (selp)
+ //-----------------------------------
+ 
+ // FIXME: Missing slct
+ 
+ // selp instructions that don't have any pattern matches; we explicitly use
+ // them within this file.
+ let hasSideEffects = 0 in {
+   multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {  // Pattern-less selp: rr/ri/ir/ii forms.
+     def rr : NVPTXInst<(outs RC:$dst),
+                        (ins RC:$a, RC:$b, Int1Regs:$p),
+                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+     def ri : NVPTXInst<(outs RC:$dst),
+                        (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+     def ir : NVPTXInst<(outs RC:$dst),
+                        (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+     def ii : NVPTXInst<(outs RC:$dst),
+                        (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                        !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+   }
+ 
+   multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,  // Same four forms, each matching (select $p, $a, $b).
+                           SDNode ImmNode> {
+     def rr :
+       NVPTXInst<(outs RC:$dst),
+                 (ins RC:$a, RC:$b, Int1Regs:$p),
+                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+     def ri :
+       NVPTXInst<(outs RC:$dst),
+                 (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+     def ir :
+       NVPTXInst<(outs RC:$dst),
+                 (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+     def ii :
+       NVPTXInst<(outs RC:$dst),
+                 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+   }
+ }
+ 
+ // Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+ // good.
+ defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+ defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+ defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+ defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+ defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+ defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+ defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+ defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+ defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+ defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;  // f16 select is printed as selp.b16.
+ defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+ defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+ 
+ def SELP_f16x2rr :  // f16x2 select, printed as selp.b32; register-register form only.
+     NVPTXInst<(outs Float16x2Regs:$dst),
+               (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
+               "selp.b32 \t$dst, $a, $b, $p;",
+               [(set Float16x2Regs:$dst,
+                     (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
+ 
+ //-----------------------------------
+ // Data Movement (Load / Store, Move)
+ //-----------------------------------
+ 
+ def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],  // i32 address, 2 result operands, rooted at frameindex
+                             [SDNPWantRoot]>;
+ def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],  // i64 counterpart of ADDRri
+                               [SDNPWantRoot]>;
+ def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;  // single-operand address, no root nodes or properties
+ 
+ def MEMri : Operand<i32> {  // 32-bit memory operand: (Int32Regs, i32imm) pair
+   let PrintMethod = "printMemOperand";
+   let MIOperandInfo = (ops Int32Regs, i32imm);
+ }
+ def MEMri64 : Operand<i64> {  // 64-bit memory operand: (Int64Regs, i64imm) pair
+   let PrintMethod = "printMemOperand";
+   let MIOperandInfo = (ops Int64Regs, i64imm);
+ }
+ 
+ def imem : Operand<iPTR> {  // pointer-typed operand printed with printOperand
+   let PrintMethod = "printOperand";
+ }
+ 
+ def imemAny : Operand<iPTRAny> {  // as imem, but typed iPTRAny
+   let PrintMethod = "printOperand";
+ }
+ 
+ def LdStCode : Operand<i32> {  // i32 operand printed with printLdStCode
+   let PrintMethod = "printLdStCode";
+ }
+ 
+ def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;  // one result, one operand, same pointer type
+ def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+ 
+ // Load a memory address into a u32 or u64 register (matches a Wrapper around a target global address).
+ def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+                          "mov.u32 \t$dst, $a;",
+                          [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+                            "mov.u64 \t$dst, $a;",
+                            [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+ 
+ // Get pointer to local stack: moves the symbol __local_depot<num> into a register.  No patterns attached.
+ let hasSideEffects = 0 in {
+   def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+                                      "mov.u32 \t$d, __local_depot$num;", []>;
+   def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+                                     "mov.u64 \t$d, __local_depot$num;", []>;
+ }
+ 
+ 
+ // Register-to-register moves (no patterns).  copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
+ let IsSimpleMove=1, hasSideEffects=0 in {
+   def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+                            "mov.pred \t$dst, $sss;", []>;
+   def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+                            "mov.u16 \t$dst, $sss;", []>;
+   def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                            "mov.u32 \t$dst, $sss;", []>;
+   def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                            "mov.u64 \t$dst, $sss;", []>;
+ 
+   def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
+                            // We have to use .b16 here as there's no mov.f16.
+                            "mov.b16 \t$dst, $src;", []>;
+   def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                            "mov.f32 \t$dst, $src;", []>;
+   def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                            "mov.f64 \t$dst, $src;", []>;
+ }
+ 
+ def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),  // Immediate moves: each matches (set reg, imm/fpimm).
+                         "mov.pred \t$dst, $src;",
+                         [(set Int1Regs:$dst, imm:$src)]>;
+ def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                          "mov.u16 \t$dst, $src;",
+                          [(set Int16Regs:$dst, imm:$src)]>;
+ def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                          "mov.u32 \t$dst, $src;",
+                          [(set Int32Regs:$dst, imm:$src)]>;
+ def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),  // NOTE(review): named "i", not "ri", unlike its siblings.
+                         "mov.u64 \t$dst, $src;",
+                         [(set Int64Regs:$dst, imm:$src)]>;
+ 
+ def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                          "mov.f32 \t$dst, $src;",
+                          [(set Float32Regs:$dst, fpimm:$src)]>;
+ def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                          "mov.f64 \t$dst, $src;",
+                          [(set Float64Regs:$dst, fpimm:$src)]>;
+ 
+ def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;  // i32 only; NOTE(review): no i64 counterpart is visible here -- confirm intended.
+ 
+ //---- Copy Frame Index ---- (materialize an ADDRri/ADDRri64 address with an add)
+ def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                             "add.u32 \t$dst, ${addr:add};",
+                             [(set Int32Regs:$dst, ADDRri:$addr)]>;
+ def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                             "add.u64 \t$dst, ${addr:add};",
+                             [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+ 
+ //-----------------------------------
+ // Comparison and Selection
+ //-----------------------------------
+ 
+ multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,  // Integer compare patterns: OpNode with compare mode Mode.
+                        Instruction setp_16rr,
+                        Instruction setp_16ri,
+                        Instruction setp_16ir,
+                        Instruction setp_32rr,
+                        Instruction setp_32ri,
+                        Instruction setp_32ir,
+                        Instruction setp_64rr,
+                        Instruction setp_64ri,
+                        Instruction setp_64ir,
+                        Instruction set_16rr,
+                        Instruction set_16ri,
+                        Instruction set_16ir,
+                        Instruction set_32rr,
+                        Instruction set_32ri,
+                        Instruction set_32ir,
+                        Instruction set_64rr,
+                        Instruction set_64ri,
+                        Instruction set_64ir> {
+   // i16 -> pred (i1 result uses the setp_* instructions)
+   def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+             (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+   def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+             (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+             (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+   // i32 -> pred
+   def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+             (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+   def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+             (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+             (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+   // i64 -> pred
+   def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+             (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+   def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+             (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+             (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+ 
+   // i16 -> i32 (i32 result uses the set_* instructions)
+   def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+             (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+   def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+             (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+             (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+   // i32 -> i32
+   def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+             (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+   def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+             (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+             (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+   // i64 -> i32
+   def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+             (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+   def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+             (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+   def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+             (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+ }
+ 
+ multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>  // ISET_FORMAT bound to the s16/s32/s64 setp and set instructions.
+   : ISET_FORMAT<OpNode, Mode,
+                 SETP_s16rr, SETP_s16ri, SETP_s16ir,
+                 SETP_s32rr, SETP_s32ri, SETP_s32ir,
+                 SETP_s64rr, SETP_s64ri, SETP_s64ir,
+                 SET_s16rr, SET_s16ri, SET_s16ir,
+                 SET_s32rr, SET_s32ri, SET_s32ir,
+                 SET_s64rr, SET_s64ri, SET_s64ir> {
+   // TableGen doesn't like empty multiclasses, so a dummy PatLeaf is defined.
+   def : PatLeaf<(i32 0)>;
+ }
+ 
+ multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>  // ISET_FORMAT bound to the u16/u32/u64 setp and set instructions.
+   : ISET_FORMAT<OpNode, Mode,
+                 SETP_u16rr, SETP_u16ri, SETP_u16ir,
+                 SETP_u32rr, SETP_u32ri, SETP_u32ir,
+                 SETP_u64rr, SETP_u64ri, SETP_u64ir,
+                 SET_u16rr, SET_u16ri, SET_u16ir,
+                 SET_u32rr, SET_u32ri, SET_u32ir,
+                 SET_u64rr, SET_u64ri, SET_u64ir> {
+   // TableGen doesn't like empty multiclasses, so a dummy PatLeaf is defined.
+   def : PatLeaf<(i32 0)>;
+ }
+ 
+ defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;  // Plain setcc nodes use the signed instructions,
+ defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+ defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+ defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+ defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+ defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+ defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;  // setu* nodes use the unsigned ones, with the same compare modes.
+ defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+ defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+ defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+ defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+ defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+ 
+ // i1 compares: inequality is an xor of the predicates, equality is its negation.
+ def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+           (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+ def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+           (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+ 
+ def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+           (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+ def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+           (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+ 
+ // i1 compare -> i32 (-1 when the compare holds, else 0): setne selects on a^b, seteq swaps the arms.
+ def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+           (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+ def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
+           (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+ 
+ 
+ 
+ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {  // FP compare patterns; ModeFTZ variants are gated on doF32FTZ.
+   // f16 -> pred (f16 immediates are materialized with LOAD_CONST_F16, so only the rr form is used)
+   def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+             (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+         Requires<[useFP16Math,doF32FTZ]>;
+   def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+             (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+         Requires<[useFP16Math]>;
+   def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+             (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+         Requires<[useFP16Math,doF32FTZ]>;
+   def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+             (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+         Requires<[useFP16Math]>;
+   def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+             (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+         Requires<[useFP16Math,doF32FTZ]>;
+   def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+             (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+         Requires<[useFP16Math]>;
+ 
+   // f32 -> pred
+   def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+             (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+             (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+             (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+             (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+   def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+             (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+             (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+ 
+   // f64 -> pred (no FTZ variants)
+   def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+             (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+   def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+             (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+   def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+             (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+ 
+   // f16 -> i32 (LOAD_CONST_F16 results are used as register operands, so the rr form is used throughout)
+   def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+             (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+         Requires<[useFP16Math, doF32FTZ]>;
+   def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+             (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+         Requires<[useFP16Math]>;
+   def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+             (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+         Requires<[useFP16Math, doF32FTZ]>;
+   def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+             (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+         Requires<[useFP16Math]>;
+   def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+             (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+         Requires<[useFP16Math, doF32FTZ]>;
+   def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+             (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+         Requires<[useFP16Math]>;
+ 
+   // f32 -> i32
+   def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+             (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+             (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+   def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+             (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+             (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+   def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+             (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+         Requires<[doF32FTZ]>;
+   def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+             (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+ 
+   // f64 -> i32 (no FTZ variants)
+   def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+             (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+   def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+             (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+   def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+             (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+ }
+ 
+ defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;  // Ordered compares use the plain compare modes.
+ defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+ defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+ defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+ defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+ defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+ 
+ defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;  // Unordered compares use the *U compare modes.
+ defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+ defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+ defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+ defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+ defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+ 
+ defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;  // Compares without an o/u prefix map to the same modes as the ordered ones.
+ defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+ defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+ defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+ defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+ 
+ defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;  // seto -> NUM mode
+ defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;  // setuo -> NAN mode
+ 
+ // FIXME: What is this doing here?  Can it be deleted?
+ // def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+ //                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+ 
+ def SDTDeclareParamProfile :  // Type profiles (result count, operand count, constraints) for the call/param nodes.
+   SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+ def SDTDeclareScalarParamProfile :
+   SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+ def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+ def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+ def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+ def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+ def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+ def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+ def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+ def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+ def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+ def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+ def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+ def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+ def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+ def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+ def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+ def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+ def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+ def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
++def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;  // one result, one operand, no type constraints
+ 
+ def DeclareParam :  // Most nodes below are chained, glued on both sides, and marked as having side effects.
+   SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def DeclareScalarParam :
+   SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def DeclareRetParam :
+   SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def DeclareRet :
+   SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def LoadParam :  // The LoadParam* nodes are marked SDNPMayLoad instead of SDNPSideEffect.
+   SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+ def LoadParamV2 :
+   SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+ def LoadParamV4 :
+   SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+ def PrintCall :
+   SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def PrintConvergentCall :
+   SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def PrintCallUni :
+   SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def PrintConvergentCallUni :
+   SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def StoreParam :
+   SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def StoreParamV2 :
+   SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def StoreParamV4 :
+   SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def StoreParamU32 :
+   SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def StoreParamS32 :
+   SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def CallArgBegin :
+   SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def CallArg :
+   SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def LastCallArg :
+   SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def CallArgEnd :
+   SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def CallVoid :
+   SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def Prototype :
+   SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def CallVal :
+   SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def MoveParam :  // The only node in this list with no properties at all.
+   SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+ def StoreRetval :  // The StoreRetval* nodes are chained with side effects but carry no glue.
+   SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+          [SDNPHasChain, SDNPSideEffect]>;
+ def StoreRetvalV2 :
+   SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+          [SDNPHasChain, SDNPSideEffect]>;
+ def StoreRetvalV4 :
+   SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+          [SDNPHasChain, SDNPSideEffect]>;
+ def PseudoUseParam :
+   SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def RETURNNode :
+   SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+          [SDNPHasChain, SDNPSideEffect]>;
++def ProxyReg :  // One-result, one-operand node; chained, glued on both sides, with side effects.
++  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
++         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ 
+ let mayLoad = 1 in {  // ld.param forms reading [retval0+$b]; scalar, v2 and v4.  No patterns attached.
+   class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                   !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
+                   []>;
+ 
+   class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+                   !strconcat("ld.param.v2", opstr,
+                              " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
+ 
+   class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+                         regclass:$dst4),
+                   (ins i32imm:$b),
+                   !strconcat("ld.param.v4", opstr,
+                              " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+                   []>;
+ }
+ 
+ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :  // mov from retval<b>; matches LoadParam whose first operand is 0.
+       NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                 !strconcat("mov", opstr, " \t$dst, retval$b;"),
+                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+ 
+ let mayStore = 1 in {  // st.param forms; none of these classes attach a pattern.
+   class StoreParamInst<NVPTXRegClass regclass, string opstr> :  // store to [param<a>+<b>]
+         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                   !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
+                   []>;
+ 
+   class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+                                i32imm:$a, i32imm:$b),
+                   !strconcat("st.param.v2", opstr,
+                              " \t[param$a+$b], {{$val, $val2}};"),
+                   []>;
+ 
+   class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+                                regclass:$val4, i32imm:$a,
+                                i32imm:$b),
+                   !strconcat("st.param.v4", opstr,
+                              " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+                   []>;
+ 
+   class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :  // store to [func_retval0+<a>]
+         NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+                   !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
+                   []>;
+ 
+   class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+                   !strconcat("st.param.v2", opstr,
+                              " \t[func_retval0+$a], {{$val, $val2}};"),
+                   []>;
+ 
+   class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+         NVPTXInst<(outs),
+                   (ins regclass:$val, regclass:$val2, regclass:$val3,
+                        regclass:$val4, i32imm:$a),
+                   !strconcat("st.param.v4", opstr,
+                              " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+                   []>;
+ }
+ 
+ let isCall=1 in {  // Every record produced by the CALL multiclass is marked isCall.
+   multiclass CALL<string OpcStr, SDNode OpNode> {  // Emits nine operand-less records: one per return-value count 0..8.
+      def PrintCallNoRetInst : NVPTXInst<(outs), (ins),  // Count 0: prints only OpcStr followed by a space.
+        !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
+      def PrintCallRetInst1 : NVPTXInst<(outs), (ins),  // Count N: matches (OpNode (i32 N)) and prints retval0..retval(N-1) in parentheses.
+        !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
+      def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
+      def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
+      def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+        [(OpNode (i32 4))]>;
+      def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
+        [(OpNode (i32 5))]>;
+      def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                             "retval5), "),
+        [(OpNode (i32 6))]>;
+      def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                             "retval5, retval6), "),
+        [(OpNode (i32 7))]>;
+      def PrintCallRetInst8 : NVPTXInst<(outs), (ins),  // Eight is the largest count defined by this multiclass.
+        !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                             "retval5, retval6, retval7), "),
+        [(OpNode (i32 8))]>;
+   }
+ }
+ 
+ defm Call : CALL<"call", PrintCall>;  // Expands CALL with the "call" mnemonic, matched on the PrintCall node.
+ defm CallUni : CALL<"call.uni", PrintCallUni>;  // Same expansion with the "call.uni" mnemonic and PrintCallUni node.
+ 
+ // Convergent call instructions.  These are identical to regular calls, except
+ // they have the isConvergent bit set.
+ let isConvergent=1 in {
+   defm ConvergentCall : CALL<"call", PrintConvergentCall>;  // Same "call" text as Call; differs in the matched node and isConvergent.
+   defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;  // Same "call.uni" text as CallUni.
+ }
+ 
+ def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;  // Concrete ld.param records: one per register class / type-suffix pair.
+ def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
+ def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
+ def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;  // The i8 variants use Int16Regs with a ".b8" suffix.
+ def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
+ def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
+ def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
+ def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
+ def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;  // No V4 record is defined for the 64-bit classes in this list.
+ def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
+ def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
+ def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;  // f16 and f16x2 use untyped ".b16" / ".b32" suffixes.
+ def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
+ def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
+ def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
+ def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
+ def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
+ def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
+ def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
+ def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
+ def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
+ def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
+ 
+ def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;  // Concrete st.param records targeting param$a, one per class / suffix pair.
+ def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
+ 
+ def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
+ def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;  // The i8 variants use Int16Regs with a ".b8" suffix.
+ def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
+ def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
+ def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
+ def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
+ 
+ def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;  // No V4 record is defined for the 64-bit classes in this list.
+ def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
+ def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
+ 
+ def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;  // f16 and f16x2 use untyped ".b16" / ".b32" suffixes.
+ def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
+ def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
+ def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
+ def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
+ def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
+ def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
+ def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
+ def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
+ def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
+ def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+ 
+ def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;  // Concrete st.param records targeting func_retval0, one per class / suffix pair.
+ def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
+ def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
+ def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;  // The i8 variants use Int16Regs with a ".b8" suffix.
+ def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
+ def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
+ def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
+ def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
+ def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;  // No V4 record is defined for the 64-bit classes in this list.
+ def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
+ def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
+ 
+ def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
+ def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
+ def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;  // f16 and f16x2 use untyped ".b16" / ".b32" suffixes.
+ def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
+ def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
+ def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
+ def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
+ def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
+ def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
+ def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
+ def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
+ 
+ def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;  // Operand-less record that prints the opening parenthesis.
+ def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;  // CallArgEnd with constant 1: prints ")" followed by ";".
+ def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;  // CallArgEnd with constant 0: prints ")" with no ";".
+ def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;  // Matches RETURNNode and prints "ret;".
+ 
+ class CallArgInst<NVPTXRegClass regclass> :  // Non-final register call argument: prints the register followed by ", ".
+   NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+             [(CallArg (i32 0), regclass:$a)]>;  // Matches CallArg whose first operand is the constant 0.
+ 
+ class LastCallArgInst<NVPTXRegClass regclass> :  // Final register call argument: prints the register with no trailing separator.
+   NVPTXInst<(outs), (ins regclass:$a), "$a",
+             [(LastCallArg (i32 0), regclass:$a)]>;  // Matches LastCallArg whose first operand is the constant 0.
+ 
+ def CallArgI64     : CallArgInst<Int64Regs>;  // Register call-argument records for the integer and f32/f64 register classes.
+ def CallArgI32     : CallArgInst<Int32Regs>;
+ def CallArgI16     : CallArgInst<Int16Regs>;
+ def CallArgF64     : CallArgInst<Float64Regs>;
+ def CallArgF32     : CallArgInst<Float32Regs>;
+ 
+ def LastCallArgI64 : LastCallArgInst<Int64Regs>;  // Same register classes as above, for the final argument.
+ def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+ def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+ def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+ def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+ 
+ def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",  // Immediate argument (first CallArg operand 0): prints the value followed by ", ".
+                               [(CallArg (i32 0), (i32 imm:$a))]>;
+ def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",  // Final immediate argument: no trailing separator.
+                                   [(LastCallArg (i32 0), (i32 imm:$a))]>;
+ 
+ def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",  // First CallArg operand 1: prints the name "param<$a>" instead of the value.
+                              [(CallArg (i32 1), (i32 imm:$a))]>;
+ def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",  // Final "param<$a>" argument: no trailing separator.
+                                  [(LastCallArg (i32 1), (i32 imm:$a))]>;
+ 
+ def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",  // Callee given as a wrapped tglobaladdr; prints it followed by ", ".
+                                   [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+ def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",  // Callee held in a 32-bit register.
+                                   [(CallVoid Int32Regs:$addr)]>;
+ def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",  // Callee held in a 64-bit register.
+                                   [(CallVoid Int64Regs:$addr)]>;
+ def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",  // Prints ", prototype_<$val>;" for the Prototype node.
+                                   [(Prototype (i32 imm:$val))]>;
+ 
+ def DeclareRetMemInst :  // Prints an aligned ".b8" array declaration named retval<$num> of $size elements.
+   NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
+             ".param .align $align .b8 retval$num[$size];",
+             [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+ def DeclareRetScalarInst :  // DeclareRet with first operand 1: prints a ".param .b<$size>" declaration.
+   NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+             ".param .b$size retval$num;",
+             [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+ def DeclareRetRegInst :  // DeclareRet with first operand 2: prints a ".reg .b<$size>" declaration.
+   NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+             ".reg .b$size retval$num;",
+             [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+ 
+ def DeclareParamInst :  // Prints an aligned ".b8" array declaration named param<$a> of $size elements.
+   NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
+             ".param .align $align .b8 param$a[$size];",
+             [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+ def DeclareScalarParamInst :  // DeclareScalarParam with last operand 0: prints a ".param .b<$size>" declaration.
+   NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+             ".param .b$size param$a;",
+             [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+ def DeclareScalarRegInst :  // DeclareScalarParam with last operand 1: prints a ".reg .b<$size>" declaration.
+   NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+             ".reg .b$size param$a;",
+             [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+ 
+ class MoveParamInst<NVPTXRegClass regclass, string asmstr> :  // Same-class register copy selected for the MoveParam node.
+   NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+             !strconcat("mov", asmstr, " \t$dst, $src;"),  // asmstr supplies the type suffix appended to "mov".
+             [(set regclass:$dst, (MoveParam regclass:$src))]>;
+ 
+ def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;  // MoveParam records per register class.
+ def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+ def MoveParamI16 :  // Written out by hand: the i16 case prints "cvt.u16.u32" rather than a "mov".
+   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+             "cvt.u16.u32 \t$dst, $src;",
+             [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+ def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+ def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+ def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
+ 
+ class PseudoUseParamInst<NVPTXRegClass regclass> :  // No outputs; reads one register for the PseudoUseParam node.
+   NVPTXInst<(outs), (ins regclass:$src),
+             "// Pseudo use of $src",  // The printed text itself starts with "//".
+             [(PseudoUseParam regclass:$src)]>;
+ 
+ def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;  // PseudoUseParam records for the integer and f32/f64 register classes.
+ def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+ def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+ def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+ def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+ 
++class ProxyRegInst<NVPTXRegClass regclass> :  // One result and one source of the same register class, selected for the ProxyReg node.
++  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
++            "// Proxy Register pseudo instruction",  // Printed text is only this "//" line; it names neither $dst nor $src.
++            [(set regclass:$dst, (ProxyReg regclass:$src))]>;
++
++let isCodeGenOnly=1, isPseudo=1 in {  // All ProxyReg records below are flagged isCodeGenOnly and isPseudo.
++  def ProxyRegI1    : ProxyRegInst<Int1Regs>;  // One record per register class, including the predicate class Int1Regs.
++  def ProxyRegI16   : ProxyRegInst<Int16Regs>;
++  def ProxyRegI32   : ProxyRegInst<Int32Regs>;
++  def ProxyRegI64   : ProxyRegInst<Int64Regs>;
++  def ProxyRegF16   : ProxyRegInst<Float16Regs>;
++  def ProxyRegF32   : ProxyRegInst<Float32Regs>;
++  def ProxyRegF64   : ProxyRegInst<Float64Regs>;
++  def ProxyRegF16x2 : ProxyRegInst<Float16x2Regs>;
++}
+ 
+ //
+ // Load / Store Handling
+ //
+ multiclass LD<NVPTXRegClass regclass> {  // Scalar load records: six addressing variants, all with empty pattern lists.
+   def _avar : NVPTXInst<  // Address is an imem operand, no offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,  // LdStCode operands are printed through the modifiers named in the asm string.
+          i32imm:$fromWidth, imem:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr];", []>;
+   def _areg : NVPTXInst<  // Address held in a 32-bit register, no offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr];", []>;
+   def _areg_64 : NVPTXInst<  // Address held in a 64-bit register, no offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr];", []>;
+   def _ari : NVPTXInst<  // 32-bit register base plus immediate offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr+$offset];", []>;
+   def _ari_64 : NVPTXInst<  // 64-bit register base plus immediate offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr+$offset];", []>;
+   def _asi : NVPTXInst<  // imem address plus immediate offset.
+     (outs regclass:$dst),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t$dst, [$addr+$offset];", []>;
+ }
+ 
+ let mayLoad=1, hasSideEffects=0 in {  // Every LD expansion below is marked mayLoad with hasSideEffects cleared.
+   defm LD_i8  : LD<Int16Regs>;  // i8 and i16 both use Int16Regs as the destination class.
+   defm LD_i16 : LD<Int16Regs>;
+   defm LD_i32 : LD<Int32Regs>;
+   defm LD_i64 : LD<Int64Regs>;
+   defm LD_f16 : LD<Float16Regs>;
+   defm LD_f16x2 : LD<Float16x2Regs>;
+   defm LD_f32 : LD<Float32Regs>;
+   defm LD_f64 : LD<Float64Regs>;
+ }
+ 
+ multiclass ST<NVPTXRegClass regclass> {  // Scalar store records: six addressing variants, all with empty pattern lists.
+   def _avar : NVPTXInst<  // Address is an imem operand, no offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,  // The stored value $src is the first input operand.
+          LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr], $src;", []>;
+   def _areg : NVPTXInst<  // Address held in a 32-bit register, no offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr], $src;", []>;
+   def _areg_64 : NVPTXInst<  // Address held in a 64-bit register, no offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr], $src;", []>;
+   def _ari : NVPTXInst<  // 32-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr+$offset], $src;", []>;
+   def _ari_64 : NVPTXInst<  // 64-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr+$offset], $src;", []>;
+   def _asi : NVPTXInst<  // imem address plus immediate offset.
+     (outs),
+     (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+          LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+     " \t[$addr+$offset], $src;", []>;
+ }
+ 
+ let mayStore=1, hasSideEffects=0 in {  // Every ST expansion below is marked mayStore with hasSideEffects cleared.
+   defm ST_i8  : ST<Int16Regs>;  // i8 and i16 both use Int16Regs as the source class.
+   defm ST_i16 : ST<Int16Regs>;
+   defm ST_i32 : ST<Int32Regs>;
+   defm ST_i64 : ST<Int64Regs>;
+   defm ST_f16 : ST<Float16Regs>;
+   defm ST_f16x2 : ST<Float16x2Regs>;
+   defm ST_f32 : ST<Float32Regs>;
+   defm ST_f64 : ST<Float64Regs>;
+ }
+ 
+ // The following is used only in and after vector elementizations.  Vector
+ // elementization happens at the machine instruction level, so the following
+ // instructions never appear in the DAG.
+ multiclass LD_VEC<NVPTXRegClass regclass> {  // Vector load records: _v2_* have two results, _v4_* have four; all pattern lists are empty.
+   def _v2_avar : NVPTXInst<  // Two results; imem address, no offset.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr];", []>;
+   def _v2_areg : NVPTXInst<  // Two results; 32-bit register address.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr];", []>;
+   def _v2_areg_64 : NVPTXInst<  // Two results; 64-bit register address.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr];", []>;
+   def _v2_ari : NVPTXInst<  // Two results; 32-bit register base plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   def _v2_ari_64 : NVPTXInst<  // Two results; 64-bit register base plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   def _v2_asi : NVPTXInst<  // Two results; imem address plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+   def _v4_avar : NVPTXInst<  // Four results; imem address, no offset.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+   def _v4_areg : NVPTXInst<  // Four results; 32-bit register address.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+   def _v4_areg_64 : NVPTXInst<  // Four results; 64-bit register address.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+   def _v4_ari : NVPTXInst<  // Four results; 32-bit register base plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+   def _v4_ari_64 : NVPTXInst<  // Four results; 64-bit register base plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+   def _v4_asi : NVPTXInst<  // Four results; imem address plus immediate offset.
+     (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+     (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+     "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ }
+ let mayLoad=1, hasSideEffects=0 in {  // Every LD_VEC expansion below is marked mayLoad with hasSideEffects cleared.
+   defm LDV_i8  : LD_VEC<Int16Regs>;  // i8 and i16 both use Int16Regs for the destination registers.
+   defm LDV_i16 : LD_VEC<Int16Regs>;
+   defm LDV_i32 : LD_VEC<Int32Regs>;
+   defm LDV_i64 : LD_VEC<Int64Regs>;
+   defm LDV_f16 : LD_VEC<Float16Regs>;
+   defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
+   defm LDV_f32 : LD_VEC<Float32Regs>;
+   defm LDV_f64 : LD_VEC<Float64Regs>;
+ }
+ 
+ multiclass ST_VEC<NVPTXRegClass regclass> {  // Vector store records: _v2_* take two sources, _v4_* take four; all pattern lists are empty.
+   def _v2_avar : NVPTXInst<  // Two sources; imem address, no offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),  // The width operand is named $fromWidth here, whereas the scalar ST uses $toWidth.
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2}};", []>;
+   def _v2_areg : NVPTXInst<  // Two sources; 32-bit register address.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2}};", []>;
+   def _v2_areg_64 : NVPTXInst<  // Two sources; 64-bit register address.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2}};", []>;
+   def _v2_ari : NVPTXInst<  // Two sources; 32-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+          i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   def _v2_ari_64 : NVPTXInst<  // Two sources; 64-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+          i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   def _v2_asi : NVPTXInst<  // Two sources; imem address plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+          LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+          i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr+$offset], {{$src1, $src2}};", []>;
+   def _v4_avar : NVPTXInst<  // Four sources; imem address, no offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+   def _v4_areg : NVPTXInst<  // Four sources; 32-bit register address.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+   def _v4_areg_64 : NVPTXInst<  // Four sources; 64-bit register address.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+   def _v4_ari : NVPTXInst<  // Four sources; 32-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+   def _v4_ari_64 : NVPTXInst<  // Four sources; 64-bit register base plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+     "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+   def _v4_asi : NVPTXInst<  // Four sources; imem address plus immediate offset.
+     (outs),
+     (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+          LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+          i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+     "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+     "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ }
+ 
+ let mayStore=1, hasSideEffects=0 in {  // Every ST_VEC expansion below is marked mayStore with hasSideEffects cleared.
+   defm STV_i8  : ST_VEC<Int16Regs>;  // i8 and i16 both use Int16Regs for the source registers.
+   defm STV_i16 : ST_VEC<Int16Regs>;
+   defm STV_i32 : ST_VEC<Int32Regs>;
+   defm STV_i64 : ST_VEC<Int64Regs>;
+   defm STV_f16 : ST_VEC<Float16Regs>;
+   defm STV_f16x2 : ST_VEC<Float16x2Regs>;
+   defm STV_f32 : ST_VEC<Float32Regs>;
+   defm STV_f64 : ST_VEC<Float64Regs>;
+ }
+ 
+ //---- Conversion ----
+ 
+ class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,  // Selects bitconvert between two register classes as a "mov.b<SzStr>".
+   NVPTXRegClass regclassOut> :
+            NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+            !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),  // Nested !strconcat yields "mov.b" + SzStr + operand text.
+      [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+ 
+ def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;  // Bitconvert records in both directions for each same-width int/float class pair.
+ def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
+ def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+ def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+ def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+ def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+ def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;  // f16x2 is paired with the 32-bit integer class.
+ def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
+ 
+ // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
+ // we cannot specify floating-point literals in isel patterns.  Therefore, we
+ // use an integer selp to select either 1 or 0 and then cvt to floating-point.
+ 
+ // sint -> f16
+ def : Pat<(f16 (sint_to_fp Int1Regs:$a)),  // i1 source: first materialise 1 or 0 with SELP_u32ii, then convert.
+           (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;  // NOTE(review): a signed i1 "true" is -1, yet 1 is selected here; confirm this is intended.
+ def : Pat<(f16 (sint_to_fp Int16Regs:$a)),  // Wider sources map directly to the matching CVT_* record with CvtRN.
+           (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
+           (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
+           (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
+ 
+ // uint -> f16
+ def : Pat<(f16 (uint_to_fp Int1Regs:$a)),  // i1 source: select 1 or 0, then unsigned convert.
+           (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+ def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
+           (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
+           (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
+           (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
+ 
+ // sint -> f32
+ def : Pat<(f32 (sint_to_fp Int1Regs:$a)),  // Same structure as the f16 group, producing f32.
+           (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+ def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+           (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+           (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+           (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+ 
+ // uint -> f32
+ def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+           (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+ def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+           (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+           (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+           (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+ 
+ // sint -> f64
+ def : Pat<(f64 (sint_to_fp Int1Regs:$a)),  // Same structure as the f16 group, producing f64.
+           (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+ def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+           (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+           (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+           (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+ 
+ // uint -> f64
+ def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+           (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+ def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+           (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+ def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+           (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+ def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+           (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+ 
+ 
+ // f16 -> sint
+ def : Pat<(i1 (fp_to_sint Float16Regs:$a)),  // i1 result: bitconvert to integer, then SETP against 0 with CmpEQ.
+           (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_sint Float16Regs:$a)),  // Wider results map directly to the matching CVT_* record with CvtRZI.
+           (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
+           (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
+           (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
+ 
+ // f16 -> uint
+ def : Pat<(i1 (fp_to_uint Float16Regs:$a)),  // Same expansion as the signed i1 case.
+           (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
+           (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
+           (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
+           (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
+ 
+ // f32 -> sint
+ def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+           (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_sint Float32Regs:$a)),  // f32 sources come in pairs: a CvtRZI_FTZ pattern gated on doF32FTZ ...
+           (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i16 (fp_to_sint Float32Regs:$a)),  // ... followed by an ungated CvtRZI pattern for the same source and result types.
+           (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+           (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+           (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+           (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+           (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+ 
+ // f32 -> uint
+ def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+           (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_uint Float32Regs:$a)),  // Same gated / ungated pairing as the signed f32 group.
+           (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+           (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+           (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+           (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+           (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+           (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+ 
+ // f64 -> sint
+ def : Pat<(i1 (fp_to_sint Float64Regs:$a)),  // f64 sources have no FTZ-gated variants in this list.
+           (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+           (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+           (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+           (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+ 
+ // f64 -> uint
+ def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+           (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+ def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+           (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+ def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+           (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+ def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+           (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+ 
+ // sext i1
+ def : Pat<(i16 (sext Int1Regs:$a)),
+           (SELP_s16ii -1, 0, Int1Regs:$a)>;
+ def : Pat<(i32 (sext Int1Regs:$a)),
+           (SELP_s32ii -1, 0, Int1Regs:$a)>;
+ def : Pat<(i64 (sext Int1Regs:$a)),
+           (SELP_s64ii -1, 0, Int1Regs:$a)>;
+ 
+ // zext i1
+ def : Pat<(i16 (zext Int1Regs:$a)),
+           (SELP_u16ii 1, 0, Int1Regs:$a)>;
+ def : Pat<(i32 (zext Int1Regs:$a)),
+           (SELP_u32ii 1, 0, Int1Regs:$a)>;
+ def : Pat<(i64 (zext Int1Regs:$a)),
+           (SELP_u64ii 1, 0, Int1Regs:$a)>;
+ 
+ // anyext i1
+ def : Pat<(i16 (anyext Int1Regs:$a)),
+           (SELP_u16ii -1, 0, Int1Regs:$a)>;
+ def : Pat<(i32 (anyext Int1Regs:$a)),
+           (SELP_u32ii -1, 0, Int1Regs:$a)>;
+ def : Pat<(i64 (anyext Int1Regs:$a)),
+           (SELP_u64ii -1, 0, Int1Regs:$a)>;
+ 
+ // sext i16
+ def : Pat<(i32 (sext Int16Regs:$a)),
+           (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+ def : Pat<(i64 (sext Int16Regs:$a)),
+           (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+ 
+ // zext i16
+ def : Pat<(i32 (zext Int16Regs:$a)),
+           (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+ def : Pat<(i64 (zext Int16Regs:$a)),
+           (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+ 
+ // anyext i16
+ def : Pat<(i32 (anyext Int16Regs:$a)),
+           (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+ def : Pat<(i64 (anyext Int16Regs:$a)),
+           (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+ 
+ // sext i32
+ def : Pat<(i64 (sext Int32Regs:$a)),
+           (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+ 
+ // zext i32
+ def : Pat<(i64 (zext Int32Regs:$a)),
+           (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+ 
+ // anyext i32
+ def : Pat<(i64 (anyext Int32Regs:$a)),
+           (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+ 
+ 
+ // truncate i64
+ def : Pat<(i32 (trunc Int64Regs:$a)),
+           (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+ def : Pat<(i16 (trunc Int64Regs:$a)),
+           (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+ def : Pat<(i1 (trunc Int64Regs:$a)),
+           (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+ 
+ // truncate i32
+ def : Pat<(i16 (trunc Int32Regs:$a)),
+           (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+ def : Pat<(i1 (trunc Int32Regs:$a)),
+           (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+ 
+ // truncate i16
+ def : Pat<(i1 (trunc Int16Regs:$a)),
+           (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+ 
+ // sext_inreg
+ def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+ def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+ def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+ def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+ def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+ def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+ 
+ 
+ // Select instructions with 32-bit predicates
+ def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+           (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+           (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+           (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
+           (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+           (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+           (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+ 
+ 
+ let hasSideEffects = 0 in {
+   // pack a set of smaller int registers to a larger int register
+   def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+                              (ins Int16Regs:$s1, Int16Regs:$s2,
+                                   Int16Regs:$s3, Int16Regs:$s4),
+                              "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
+   def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+                              (ins Int16Regs:$s1, Int16Regs:$s2),
+                              "mov.b32 \t$d, {{$s1, $s2}};", []>;
+   def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+                              (ins Int32Regs:$s1, Int32Regs:$s2),
+                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
+   def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+                              (ins Float32Regs:$s1, Float32Regs:$s2),
+                              "mov.b64 \t$d, {{$s1, $s2}};", []>;
+ 
+   // unpack a larger int register to a set of smaller int registers
+   def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                    Int16Regs:$d3, Int16Regs:$d4),
+                              (ins Int64Regs:$s),
+                              "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+   def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                              (ins Int32Regs:$s),
+                              "mov.b32 \t{{$d1, $d2}}, $s;", []>;
+   def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                              (ins Int64Regs:$s),
+                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+   def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                              (ins Float64Regs:$s),
+                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ 
+ }
+ 
+ let hasSideEffects = 0 in {
+   // Extract element of f16x2 register. PTX does not provide any way
+   // to access elements of f16x2 vector directly, so we need to
+   // extract it using a temporary register.
+   def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
+                                (ins Float16x2Regs:$src),
+                                "{{ .reg .b16 \t%tmp_hi;\n\t"
+                                "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
+                                [(set Float16Regs:$dst,
+                                  (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
+   def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
+                                (ins Float16x2Regs:$src),
+                                "{{ .reg .b16 \t%tmp_lo;\n\t"
+                                "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
+                                [(set Float16Regs:$dst,
+                                  (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
+ 
+   // Coalesce two f16 registers into f16x2
+   def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
+                              (ins Float16Regs:$a, Float16Regs:$b),
+                              "mov.b32 \t$dst, {{$a, $b}};",
+                              [(set Float16x2Regs:$dst,
+                                (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
+ 
+   // Directly initializing underlying the b32 register is one less SASS
+   // instruction than than vector-packing move.
+   def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
+                               "mov.b32 \t$dst, $src;",
+                               []>;
+ 
+   // Split f16x2 into two f16 registers.
+   def SplitF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+                               (ins Float16x2Regs:$src),
+                               "mov.b32 \t{{$lo, $hi}}, $src;",
+                               []>;
+   // Split an i32 into two f16
+   def SplitI32toF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+                                    (ins Int32Regs:$src),
+                                    "mov.b32 \t{{$lo, $hi}}, $src;",
+                                    []>;
+ }
+ 
+ // Count leading zeros
+ let hasSideEffects = 0 in {
+   def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                          "clz.b32 \t$d, $a;", []>;
+   def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                          "clz.b64 \t$d, $a;", []>;
+ }
+ 
+ // 32-bit has a direct PTX instruction
+ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
+ 
+ // The return type of the ctlz ISD node is the same as its input, but the PTX
+ // ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
+ // ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+ // truncating back down to 32 bits.
+ def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
+ 
+ // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+ // result back to 16-bits if necessary.  We also need to subtract 16 because
+ // the high-order 16 zeros were counted.
+ //
+ // TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+ // use to save one SASS instruction (on sm_35 anyway):
+ //
+ //   mov.b32 $tmp, {0xffff, $a}
+ //   ctlz.b32 $result, $tmp
+ //
+ // That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+ // and then ctlz that value.  This way we don't have to subtract 16 from the
+ // result.  Unfortunately today we don't have a way to generate
+ // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
+ def : Pat<(ctlz Int16Regs:$a),
+           (SUBi16ri (CVT_u16_u32
+            (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+ def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+           (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
+ 
+ // Population count
+ let hasSideEffects = 0 in {
+   def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                           "popc.b32 \t$d, $a;", []>;
+   def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                           "popc.b64 \t$d, $a;", []>;
+ }
+ 
+ // 32-bit has a direct PTX instruction
+ def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
+ 
+ // For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
+ // to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
+ // pattern that avoids the type conversion if we're truncating the result to
+ // i32 anyway.
+ def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
+ 
+ // For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits.
+ // If we know that we're storing into an i32, we can avoid the final trunc.
+ def : Pat<(ctpop Int16Regs:$a),
+           (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+ def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+           (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
+ 
+ // fpround f32 -> f16
+ def : Pat<(f16 (fpround Float32Regs:$a)),
+           (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+ 
+ // fpround f64 -> f16
+ def : Pat<(f16 (fpround Float64Regs:$a)),
+           (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+ 
+ // fpround f64 -> f32
+ def : Pat<(f32 (fpround Float64Regs:$a)),
+           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(f32 (fpround Float64Regs:$a)),
+           (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+ 
+ // fpextend f16 -> f32
+ def : Pat<(f32 (fpextend Float16Regs:$a)),
+           (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(f32 (fpextend Float16Regs:$a)),
+           (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
+ 
+ // fpextend f16 -> f64
+ def : Pat<(f64 (fpextend Float16Regs:$a)),
+           (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
+ 
+ // fpextend f32 -> f64
+ def : Pat<(f64 (fpextend Float32Regs:$a)),
+           (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(f64 (fpextend Float32Regs:$a)),
+           (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+ 
+ def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue]>;
+ 
+ // fceil, ffloor, fround, ftrunc.
+ 
+ def : Pat<(fceil Float16Regs:$a),
+           (CVT_f16_f16 Float16Regs:$a, CvtRPI)>;
+ def : Pat<(fceil Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(fceil Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(fceil Float64Regs:$a),
+           (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+ 
+ def : Pat<(ffloor Float16Regs:$a),
+           (CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
+ def : Pat<(ffloor Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(ffloor Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(ffloor Float64Regs:$a),
+           (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+ 
+ def : Pat<(f16 (fround Float16Regs:$a)),
+           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
+ def : Pat<(fround Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(f32 (fround Float32Regs:$a)),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(f64 (fround Float64Regs:$a)),
+           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+ 
+ def : Pat<(ftrunc Float16Regs:$a),
+           (CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
+ def : Pat<(ftrunc Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(ftrunc Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(ftrunc Float64Regs:$a),
+           (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+ 
+ // nearbyint and rint are implemented as rounding to nearest even.  This isn't
+ // strictly correct, because it causes us to ignore the rounding mode.  But it
+ // matches what CUDA's "libm" does.
+ 
+ def : Pat<(fnearbyint Float16Regs:$a),
+           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
+ def : Pat<(fnearbyint Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(fnearbyint Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(fnearbyint Float64Regs:$a),
+           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+ 
+ def : Pat<(frint Float16Regs:$a),
+           (CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
+ def : Pat<(frint Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(frint Float32Regs:$a),
+           (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(frint Float64Regs:$a),
+           (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+ 
+ 
+ //-----------------------------------
+ // Control-flow
+ //-----------------------------------
+ 
+ let isTerminator=1 in {
+    let isReturn=1, isBarrier=1 in
+       def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+ 
+    let isBranch=1 in
+       def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                               "@$a bra \t$target;",
+                               [(brcond Int1Regs:$a, bb:$target)]>;
+    let isBranch=1 in
+       def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                                    "@!$a bra \t$target;", []>;
+ 
+    let isBranch=1, isBarrier=1 in
+       def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+                            "bra.uni \t$target;", [(br bb:$target)]>;
+ }
+ 
+ def : Pat<(brcond Int32Regs:$a, bb:$target),
+           (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+ 
+ // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
+ // conditional branch if the target block is the next block so that the code
+ // can fall through to the target block.  The invertion is done by 'xor
+ // condition, 1', which will be translated to (setne condition, -1).  Since ptx
+ // supports '@!pred bra target', we should use it.
+ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+           (CBranchOther Int1Regs:$a, bb:$target)>;
+ 
+ // Call
+ def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>]>;
+ def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+ 
+ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                            [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+ def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPSideEffect]>;
+ 
+ def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+ def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+ def calltarget : Operand<i32>;
+ let isCall=1 in {
+    def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
+ }
+ 
+ def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+ def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
+ 
+ // Pseudo instructions.
+ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : NVPTXInst<outs, ins, asmstr, pattern>;
+ 
+ def Callseq_Start :
+   NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+             "\\{ // callseq $amt1, $amt2\n"
+             "\t.reg .b32 temp_param_reg;",
+             [(callseq_start timm:$amt1, timm:$amt2)]>;
+ def Callseq_End :
+   NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+             "\\} // callseq $amt1",
+             [(callseq_end timm:$amt1, timm:$amt2)]>;
+ 
+ // trap instruction
+ def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
+ 
+ // Call prototype wrapper
+ def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+ def CallPrototype :
+   SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+ def ProtoIdent : Operand<i32> {
+   let PrintMethod = "printProtoIdent";
+ }
+ def CALL_PROTOTYPE :
+   NVPTXInst<(outs), (ins ProtoIdent:$ident),
+             "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+ 
+ 
+ include "NVPTXIntrinsics.td"
+ 
+ 
+ //-----------------------------------
+ // Notes
+ //-----------------------------------
+ // BSWAP is currently expanded. The following is a more efficient
+ // - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
+ // - for sm_20, use pmpt (use vector scalar mov to get the pack and
+ //   unpack). sm_20 supports native 32-bit register, but not native 16-bit
+ // register.
+diff --git a/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
+new file mode 100644
+index 00000000000..f0f22a476bb
+--- /dev/null
++++ b/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
+@@ -0,0 +1,114 @@
++//===- NVPTXProxyRegErasure.cpp - NVPTX Proxy Register Instruction Erasure -==//
++//
++//                     The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++//===----------------------------------------------------------------------===//
++//
++// The pass is needed to remove ProxyReg instructions and restore related
++// registers. The instructions were needed at instruction selection stage to
++// make sure that callseq_end nodes won't be removed as "dead nodes". This can
++// happen when we expand instructions into libcalls and the call site doesn't
++// care about the libcall chain. The call site cares only about data flow, and
++// the last data flow node happens to be before callseq_end. Therefore the node
++// becomes dangling and "dead". The ProxyReg acts like an additional data flow
++// node *after* the callseq_end in the chain and ensures that everything will be
++// preserved.
++//
++//===----------------------------------------------------------------------===//
++
++#include "NVPTX.h"
++#include "llvm/CodeGen/MachineFunctionPass.h"
++#include "llvm/CodeGen/MachineInstrBuilder.h"
++#include "llvm/CodeGen/MachineRegisterInfo.h"
++#include "llvm/CodeGen/TargetInstrInfo.h"
++#include "llvm/CodeGen/TargetRegisterInfo.h"
++
++using namespace llvm;
++
++namespace {
++
++struct NVPTXProxyRegErasure : public MachineFunctionPass {
++public:
++  static char ID;
++  NVPTXProxyRegErasure() : MachineFunctionPass(ID) {}
++  // Removes all ProxyReg instructions; returns true if any were removed.
++  bool runOnMachineFunction(MachineFunction &MF) override;
++
++  StringRef getPassName() const override {
++    return "NVPTX Proxy Register Instruction Erasure";
++  }
++
++  void getAnalysisUsage(AnalysisUsage &AU) const override {
++    MachineFunctionPass::getAnalysisUsage(AU);
++  }
++
++private:
++  void replaceMachineInstructionUsage(MachineFunction &MF, MachineInstr &MI);
++  // Rewrites Instr's use operands that read From's register to read To's.
++  void replaceRegisterUsage(MachineInstr &Instr, MachineOperand &From,
++                            MachineOperand &To);
++};
++
++} // namespace
++
++char NVPTXProxyRegErasure::ID = 0;
++
++bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) {
++  // ProxyReg instructions found during the scan; erased once it is finished.
++  SmallVector<MachineInstr *, 16> ProxyRegs;
++  for (MachineBasicBlock &Block : MF) {
++    for (MachineInstr &Inst : Block) {
++      switch (Inst.getOpcode()) {
++      case NVPTX::ProxyRegI1:
++      case NVPTX::ProxyRegI16:
++      case NVPTX::ProxyRegI32:
++      case NVPTX::ProxyRegI64:
++      case NVPTX::ProxyRegF16:
++      case NVPTX::ProxyRegF16x2:
++      case NVPTX::ProxyRegF32:
++      case NVPTX::ProxyRegF64:
++        // Redirect users of the proxy result first, then queue the erase.
++        replaceMachineInstructionUsage(MF, Inst);
++        ProxyRegs.push_back(&Inst);
++        break;
++      }
++    }
++  }
++
++  for (MachineInstr *Dead : ProxyRegs)
++    Dead->eraseFromParent();
++
++  return !ProxyRegs.empty();
++}
++
++void NVPTXProxyRegErasure::replaceMachineInstructionUsage(MachineFunction &MF,
++                                                          MachineInstr &MI) {
++  // The proxy's first def is its result; its first use is the proxied value.
++  MachineOperand &Result = *MI.defs().begin();
++  MachineOperand &Source = *MI.uses().begin();
++  assert(Source.isReg() && "ProxyReg input operand should be a register.");
++  assert(Result.isReg() && "ProxyReg output operand should be a register.");
++
++  // Scan the entire function, retargeting reads of Result to Source.
++  for (MachineBasicBlock &Block : MF) {
++    for (MachineInstr &User : Block)
++      replaceRegisterUsage(User, Result, Source);
++  }
++}
++
++void NVPTXProxyRegErasure::replaceRegisterUsage(MachineInstr &Instr,
++                                                MachineOperand &From,
++                                                MachineOperand &To) {
++  // Redirect each use operand reading From's register to To's register.
++  for (MachineOperand &Use : Instr.uses()) {
++    if (Use.isReg() && Use.getReg() == From.getReg())
++      Use.setReg(To.getReg());
++  }
++}
++
++MachineFunctionPass *llvm::createNVPTXProxyRegErasurePass() {
++  return new NVPTXProxyRegErasure(); // Fresh heap-allocated pass instance.
++}
+diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+index 8c009aed887..9e62915a1a0 100644
+--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
++++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+@@ -1,386 +1,392 @@
+ //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+ //
+ //                     The LLVM Compiler Infrastructure
+ //
+ // This file is distributed under the University of Illinois Open Source
+ // License. See LICENSE.TXT for details.
+ //
+ //===----------------------------------------------------------------------===//
+ //
+ // Top-level implementation for the NVPTX target.
+ //
+ //===----------------------------------------------------------------------===//
+ 
+ #include "NVPTXTargetMachine.h"
+ #include "NVPTX.h"
+ #include "NVPTXAllocaHoisting.h"
+ #include "NVPTXLowerAggrCopies.h"
+ #include "NVPTXTargetObjectFile.h"
+ #include "NVPTXTargetTransformInfo.h"
+ #include "llvm/ADT/STLExtras.h"
+ #include "llvm/ADT/Triple.h"
+ #include "llvm/Analysis/TargetTransformInfo.h"
+ #include "llvm/CodeGen/Passes.h"
+ #include "llvm/CodeGen/TargetPassConfig.h"
+ #include "llvm/IR/LegacyPassManager.h"
+ #include "llvm/Pass.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/Support/TargetRegistry.h"
+ #include "llvm/Target/TargetMachine.h"
+ #include "llvm/Target/TargetOptions.h"
+ #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+ #include "llvm/Transforms/Scalar.h"
+ #include "llvm/Transforms/Scalar/GVN.h"
+ #include "llvm/Transforms/Vectorize.h"
+ #include <cassert>
+ #include <string>
+ 
+ using namespace llvm;
+ 
+ // LSV is still relatively new; this switch lets us turn it off in case we
+ // encounter (or suspect) a bug.
+ static cl::opt<bool>
+     DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
+                                cl::desc("Disable load/store vectorizer"),
+                                cl::init(false), cl::Hidden);
+ 
+ // TODO: Remove this flag when we are confident with no regressions.
+ static cl::opt<bool> DisableRequireStructuredCFG(
+     "disable-nvptx-require-structured-cfg",
+     cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
+              "structured CFG. The requirement should be disabled only when "
+              "unexpected regressions happen."),
+     cl::init(false), cl::Hidden);
+ 
+ static cl::opt<bool> UseShortPointersOpt(
+     "nvptx-short-ptr",
+     cl::desc(
+         "Use 32-bit pointers for accessing const/local/shared address spaces."),
+     cl::init(false), cl::Hidden);
+ 
+ namespace llvm {
+ 
+ void initializeNVVMIntrRangePass(PassRegistry&);
+ void initializeNVVMReflectPass(PassRegistry&);
+ void initializeGenericToNVVMPass(PassRegistry&);
+ void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+ void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+ void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+ void initializeNVPTXLowerArgsPass(PassRegistry &);
+ void initializeNVPTXLowerAllocaPass(PassRegistry &);
+ 
+ } // end namespace llvm
+ 
+ extern "C" void LLVMInitializeNVPTXTarget() {
+   // Register the target.
+   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
+   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
+ 
+   // FIXME: This pass is really intended to be invoked during IR optimization,
+   // but it's very NVPTX-specific.
+   PassRegistry &PR = *PassRegistry::getPassRegistry();
+   initializeNVVMReflectPass(PR);
+   initializeNVVMIntrRangePass(PR);
+   initializeGenericToNVVMPass(PR);
+   initializeNVPTXAllocaHoistingPass(PR);
+   initializeNVPTXAssignValidGlobalNamesPass(PR);
+   initializeNVPTXLowerArgsPass(PR);
+   initializeNVPTXLowerAllocaPass(PR);
+   initializeNVPTXLowerAggrCopiesPass(PR);
+ }
+ 
+ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
+   std::string Ret = "e";
+ 
+   if (!is64Bit)
+     Ret += "-p:32:32";
+   else if (UseShortPointers)
+     Ret += "-p3:32:32-p4:32:32-p5:32:32";
+ 
+   Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
+ 
+   return Ret;
+ }
+ 
+ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
+                                        StringRef CPU, StringRef FS,
+                                        const TargetOptions &Options,
+                                        Optional<Reloc::Model> RM,
+                                        Optional<CodeModel::Model> CM,
+                                        CodeGenOpt::Level OL, bool is64bit)
+     // The pic relocation model is used regardless of what the client has
+     // specified, as it is the only relocation model currently supported.
+     : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
+                         CPU, FS, Options, Reloc::PIC_,
+                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
+       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
+       TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+       Subtarget(TT, CPU, FS, *this) {
+   if (TT.getOS() == Triple::NVCL)
+     drvInterface = NVPTX::NVCL;
+   else
+     drvInterface = NVPTX::CUDA;
+   if (!DisableRequireStructuredCFG)
+     setRequiresStructuredCFG(true);
+   initAsmInfo();
+ }
+ 
+ NVPTXTargetMachine::~NVPTXTargetMachine() = default;
+ 
+ void NVPTXTargetMachine32::anchor() {}
+ 
+ NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
+                                            StringRef CPU, StringRef FS,
+                                            const TargetOptions &Options,
+                                            Optional<Reloc::Model> RM,
+                                            Optional<CodeModel::Model> CM,
+                                            CodeGenOpt::Level OL, bool JIT)
+     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+ 
+ void NVPTXTargetMachine64::anchor() {}
+ 
+ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
+                                            StringRef CPU, StringRef FS,
+                                            const TargetOptions &Options,
+                                            Optional<Reloc::Model> RM,
+                                            Optional<CodeModel::Model> CM,
+                                            CodeGenOpt::Level OL, bool JIT)
+     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+ 
+ namespace {
+ 
+ class NVPTXPassConfig : public TargetPassConfig {
+ public:
+   NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
+       : TargetPassConfig(TM, PM) {}
+ 
+   NVPTXTargetMachine &getNVPTXTargetMachine() const {
+     return getTM<NVPTXTargetMachine>();
+   }
+ 
+   void addIRPasses() override;
+   bool addInstSelector() override;
++  void addPreRegAlloc() override;
+   void addPostRegAlloc() override;
+   void addMachineSSAOptimization() override;
+ 
+   FunctionPass *createTargetRegisterAllocator(bool) override;
+   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+ 
+ private:
+   // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
+   // function is only called in opt mode.
+   void addEarlyCSEOrGVNPass();
+ 
+   // Add passes that propagate special memory spaces.
+   void addAddressSpaceInferencePasses();
+ 
+   // Add passes that perform straight-line scalar optimizations.
+   void addStraightLineScalarOptimizationPasses();
+ };
+ 
+ } // end anonymous namespace
+ 
+ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+   return new NVPTXPassConfig(*this, PM);
+ }
+ 
+ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+   Builder.addExtension(
+     PassManagerBuilder::EP_EarlyAsPossible,
+     [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+       PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
+       PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+     });
+ }
+ 
+ TargetTransformInfo
+ NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
+   return TargetTransformInfo(NVPTXTTIImpl(this, F));
+ }
+ 
+ void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+   if (getOptLevel() == CodeGenOpt::Aggressive)
+     addPass(createGVNPass());
+   else
+     addPass(createEarlyCSEPass());
+ }
+ 
+ void NVPTXPassConfig::addAddressSpaceInferencePasses() {
+   // NVPTXLowerArgs emits alloca for byval parameters which can often
+   // be eliminated by SROA.
+   addPass(createSROAPass());
+   addPass(createNVPTXLowerAllocaPass());
+   addPass(createInferAddressSpacesPass());
+ }
+ 
+ void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
+   addPass(createSeparateConstOffsetFromGEPPass());
+   addPass(createSpeculativeExecutionPass());
+   // ReassociateGEPs exposes more opportunites for SLSR. See
+   // the example in reassociate-geps-and-slsr.ll.
+   addPass(createStraightLineStrengthReducePass());
+   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
+   // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
+   // for some of our benchmarks.
+   addEarlyCSEOrGVNPass();
+   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+   addPass(createNaryReassociatePass());
+   // NaryReassociate on GEPs creates redundant common expressions, so run
+   // EarlyCSE after it.
+   addPass(createEarlyCSEPass());
+ }
+ 
+ void NVPTXPassConfig::addIRPasses() {
+   // The following passes are known to not play well with virtual regs hanging
+   // around after register allocation (which in our case, is *all* registers).
+   // We explicitly disable them here.  We do, however, need some functionality
+   // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+   // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+   disablePass(&PrologEpilogCodeInserterID);
+   disablePass(&MachineCopyPropagationID);
+   disablePass(&TailDuplicateID);
+   disablePass(&StackMapLivenessID);
+   disablePass(&LiveDebugValuesID);
+   disablePass(&PostRAMachineSinkingID);
+   disablePass(&PostRASchedulerID);
+   disablePass(&FuncletLayoutID);
+   disablePass(&PatchableFunctionID);
+   disablePass(&ShrinkWrapID);
+ 
+   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
+   // it here does nothing.  But since we need it for correctness when lowering
+   // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
+   // call addEarlyAsPossiblePasses.
+   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
+   addPass(createNVVMReflectPass(ST.getSmVersion()));
+ 
+   if (getOptLevel() != CodeGenOpt::None)
+     addPass(createNVPTXImageOptimizerPass());
+   addPass(createNVPTXAssignValidGlobalNamesPass());
+   addPass(createGenericToNVVMPass());
+ 
+   // NVPTXLowerArgs is required for correctness and should be run right
+   // before the address space inference passes.
+   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
+   if (getOptLevel() != CodeGenOpt::None) {
+     addAddressSpaceInferencePasses();
+     if (!DisableLoadStoreVectorizer)
+       addPass(createLoadStoreVectorizerPass());
+     addStraightLineScalarOptimizationPasses();
+   }
+ 
+   // === LSR and other generic IR passes ===
+   TargetPassConfig::addIRPasses();
+   // EarlyCSE is not always strong enough to clean up what LSR produces. For
+   // example, GVN can combine
+   //
+   //   %0 = add %a, %b
+   //   %1 = add %b, %a
+   //
+   // and
+   //
+   //   %0 = shl nsw %a, 2
+   //   %1 = shl %a, 2
+   //
+   // but EarlyCSE can do neither of them.
+   if (getOptLevel() != CodeGenOpt::None)
+     addEarlyCSEOrGVNPass();
+ }
+ 
+ bool NVPTXPassConfig::addInstSelector() {
+   const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
+ 
+   addPass(createLowerAggrCopies());
+   addPass(createAllocaHoisting());
+   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+ 
+   if (!ST.hasImageHandles())
+     addPass(createNVPTXReplaceImageHandlesPass());
+ 
+   return false;
+ }
+ 
++void NVPTXPassConfig::addPreRegAlloc() {
++  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
++  addPass(createNVPTXProxyRegErasurePass());
++}
++
+ void NVPTXPassConfig::addPostRegAlloc() {
+   addPass(createNVPTXPrologEpilogPass(), false);
+   if (getOptLevel() != CodeGenOpt::None) {
+     // NVPTXPrologEpilogPass calculates frame object offset and replace frame
+     // index with VRFrame register. NVPTXPeephole need to be run after that and
+     // will replace VRFrame with VRFrameLocal when possible.
+     addPass(createNVPTXPeephole());
+   }
+ }
+ 
+ FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+   return nullptr; // No reg alloc
+ }
+ 
+ void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+   assert(!RegAllocPass && "NVPTX uses no regalloc!");
+   addPass(&PHIEliminationID);
+   addPass(&TwoAddressInstructionPassID);
+ }
+ 
+ void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+   assert(!RegAllocPass && "NVPTX uses no regalloc!");
+ 
+   addPass(&ProcessImplicitDefsID);
+   addPass(&LiveVariablesID);
+   addPass(&MachineLoopInfoID);
+   addPass(&PHIEliminationID);
+ 
+   addPass(&TwoAddressInstructionPassID);
+   addPass(&RegisterCoalescerID);
+ 
+   // PreRA instruction scheduling.
+   if (addPass(&MachineSchedulerID))
+     printAndVerify("After Machine Scheduling");
+ 
+ 
+   addPass(&StackSlotColoringID);
+ 
+   // FIXME: Needs physical registers
+   //addPass(&MachineLICMID);
+ 
+   printAndVerify("After StackSlotColoring");
+ }
+ 
+ void NVPTXPassConfig::addMachineSSAOptimization() {
+   // Pre-ra tail duplication.
+   if (addPass(&EarlyTailDuplicateID))
+     printAndVerify("After Pre-RegAlloc TailDuplicate");
+ 
+   // Optimize PHIs before DCE: removing dead PHI cycles may make more
+   // instructions dead.
+   addPass(&OptimizePHIsID);
+ 
+   // This pass merges large allocas. StackSlotColoring is a different pass
+   // which merges spill slots.
+   addPass(&StackColoringID);
+ 
+   // If the target requests it, assign local variables to stack slots relative
+   // to one another and simplify frame index references where possible.
+   addPass(&LocalStackSlotAllocationID);
+ 
+   // With optimization, dead code should already be eliminated. However
+   // there is one known exception: lowered code for arguments that are only
+   // used by tail calls, where the tail calls reuse the incoming stack
+   // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+   addPass(&DeadMachineInstructionElimID);
+   printAndVerify("After codegen DCE pass");
+ 
+   // Allow targets to insert passes that improve instruction level parallelism,
+   // like if-conversion. Such passes will typically need dominator trees and
+   // loop info, just like LICM and CSE below.
+   if (addILPOpts())
+     printAndVerify("After ILP optimizations");
+ 
+   addPass(&EarlyMachineLICMID);
+   addPass(&MachineCSEID);
+ 
+   addPass(&MachineSinkingID);
+   printAndVerify("After Machine LICM, CSE and Sinking passes");
+ 
+   addPass(&PeepholeOptimizerID);
+   printAndVerify("After codegen peephole optimization pass");
+ }
+diff --git a/test/CodeGen/NVPTX/calls-with-phi.ll b/test/CodeGen/NVPTX/calls-with-phi.ll
+new file mode 100644
+index 00000000000..6e010ea9adc
+--- /dev/null
++++ b/test/CodeGen/NVPTX/calls-with-phi.ll
+@@ -0,0 +1,22 @@
++; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
++; Make sure this example doesn't crash with a segfault
++
++; CHECK: .visible .func ({{.*}}) loop
++define i32 @loop(i32, i32) {
++entry:
++  br label %loop
++
++loop:
++  %i = phi i32 [ %0, %entry ], [ %res, %loop ]
++  %res = call i32 @div(i32 %i, i32 %1)
++
++  %exitcond = icmp eq i32 %res, %0
++  br i1 %exitcond, label %exit, label %loop
++
++exit:
++  ret i32 %res
++}
++
++define i32 @div(i32, i32) {
++  ret i32 0
++}
+diff --git a/test/CodeGen/NVPTX/libcall-fulfilled.ll b/test/CodeGen/NVPTX/libcall-fulfilled.ll
+new file mode 100644
+index 00000000000..dcd4482aef2
+--- /dev/null
++++ b/test/CodeGen/NVPTX/libcall-fulfilled.ll
+@@ -0,0 +1,27 @@
++; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
++; Check that libcalls defined in the current module can be called
++
++define i128 @remainder(i128, i128) {
++bb0:
++  ; CHECK:      { // callseq 0, 0
++  ; CHECK:      call.uni (retval0),
++  ; CHECK-NEXT: __umodti3,
++  ; CHECK-NEXT: (
++  ; CHECK-NEXT: param0,
++  ; CHECK-NEXT: param1
++  ; CHECK-NEXT: );
++  ; CHECK-NEXT: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [retval0+0];
++  ; CHECK-NEXT: } // callseq 0
++  %a = urem i128 %0, %1
++  br label %bb1
++
++bb1:
++  ; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]};
++  ; CHECK-NEXT: ret;
++  ret i128 %a
++}
++
++; Underlying libcall
++define i128 @__umodti3(i128, i128) {
++  ret i128 0
++}
+diff --git a/test/CodeGen/NVPTX/libcall-instruction.ll b/test/CodeGen/NVPTX/libcall-instruction.ll
+index 0c2cab7eaa5..cba6e9ca4c1 100644
+--- a/test/CodeGen/NVPTX/libcall-instruction.ll
++++ b/test/CodeGen/NVPTX/libcall-instruction.ll
+@@ -1,8 +1,8 @@
+ ; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
+-; used to panic on failed assetion and now fails with a "Cannot select"
++; used to panic on a failed assertion and now fails with an "Undefined external symbol"
+ 
+-; CHECK: LLVM ERROR: Cannot select: {{t28|0x[0-9a-f]+}}: i32 = ExternalSymbol'__umodti3'
++; CHECK: LLVM ERROR: Undefined external symbol "__umodti3"
+ define hidden i128 @remainder(i128, i128) {
+   %3 = urem i128 %0, %1
+   ret i128 %3
+ }
+diff --git a/test/CodeGen/NVPTX/libcall-intrinsic.ll b/test/CodeGen/NVPTX/libcall-intrinsic.ll
+new file mode 100644
+index 00000000000..caad8273f57
+--- /dev/null
++++ b/test/CodeGen/NVPTX/libcall-intrinsic.ll
+@@ -0,0 +1,10 @@
++; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
++; used to segfault and now fails with an "Undefined external symbol"
++
++; CHECK: LLVM ERROR: Undefined external symbol "__powidf2"
++define double @powi(double, i32) {
++  %a = call double @llvm.powi.f64(double %0, i32 %1)
++  ret double %a
++}
++
++declare double @llvm.powi.f64(double, i32) nounwind readnone
+diff --git a/test/CodeGen/NVPTX/zero-cs.ll b/test/CodeGen/NVPTX/zero-cs.ll
+deleted file mode 100644
+index 7a7a99030d4..00000000000
+--- a/test/CodeGen/NVPTX/zero-cs.ll
++++ /dev/null
+@@ -1,10 +0,0 @@
+-; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
+-; used to seqfault and now fails with a "Cannot select"
+-
+-; CHECK: LLVM ERROR: Cannot select: {{t7|0x[0-9a-f]+}}: i32 = ExternalSymbol'__powidf2'
+-define double @powi() {
+-  %1 = call double @llvm.powi.f64(double 1.000000e+00, i32 undef)
+-  ret double %1
+-}
+-
+-declare double @llvm.powi.f64(double, i32) nounwind readnone
Index: test/CodeGen/NVPTX/calls-with-phi.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/calls-with-phi.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
+; Make sure this example doesn't crash with a segfault
+
+; CHECK: .visible .func ({{.*}}) loop
+define i32 @loop(i32, i32) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ %0, %entry ], [ %res, %loop ]
+  %res = call i32 @div(i32 %i, i32 %1)
+
+  %exitcond = icmp eq i32 %res, %0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret i32 %res
+}
+
+define i32 @div(i32, i32) {
+  ret i32 0
+}
Index: test/CodeGen/NVPTX/libcall-fulfilled.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/libcall-fulfilled.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=nvptx 2>&1 | FileCheck %s
+; Check that libcalls defined in the current module can be called
+
+define i128 @remainder(i128, i128) {
+bb0:
+  ; CHECK:      { // callseq 0, 0
+  ; CHECK:      call.uni (retval0),
+  ; CHECK-NEXT: __umodti3,
+  ; CHECK-NEXT: (
+  ; CHECK-NEXT: param0,
+  ; CHECK-NEXT: param1
+  ; CHECK-NEXT: );
+  ; CHECK-NEXT: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [retval0+0];
+  ; CHECK-NEXT: } // callseq 0
+  %a = urem i128 %0, %1
+  br label %bb1
+
+bb1:
+  ; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%[[REG0]], %[[REG1]]};
+  ; CHECK-NEXT: ret;
+  ret i128 %a
+}
+
+; Underlying libcall
+define i128 @__umodti3(i128, i128) {
+  ret i128 0
+}
Index: test/CodeGen/NVPTX/libcall-instruction.ll
===================================================================
--- test/CodeGen/NVPTX/libcall-instruction.ll
+++ test/CodeGen/NVPTX/libcall-instruction.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
-; used to panic on failed assetion and now fails with a "Cannot select"
+; used to panic on a failed assertion and now fails with an "Undefined external symbol"
 
-; CHECK: LLVM ERROR: Cannot select: {{t28|0x[0-9a-f]+}}: i32 = ExternalSymbol'__umodti3'
+; CHECK: LLVM ERROR: Undefined external symbol "__umodti3"
 define hidden i128 @remainder(i128, i128) {
   %3 = urem i128 %0, %1
   ret i128 %3
Index: test/CodeGen/NVPTX/libcall-intrinsic.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/libcall-intrinsic.ll
@@ -0,0 +1,10 @@
+; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
+; used to segfault and now fails with an "Undefined external symbol"
+
+; CHECK: LLVM ERROR: Undefined external symbol "__powidf2"
+define double @powi(double, i32) {
+  %a = call double @llvm.powi.f64(double %0, i32 %1)
+  ret double %a
+}
+
+declare double @llvm.powi.f64(double, i32) nounwind readnone
Index: test/CodeGen/NVPTX/zero-cs.ll
===================================================================
--- test/CodeGen/NVPTX/zero-cs.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: not llc < %s -march=nvptx 2>&1 | FileCheck %s
-; used to seqfault and now fails with a "Cannot select"
-
-; CHECK: LLVM ERROR: Cannot select: {{t7|0x[0-9a-f]+}}: i32 = ExternalSymbol'__powidf2'
-define double @powi() {
-  %1 = call double @llvm.powi.f64(double 1.000000e+00, i32 undef)
-  ret double %1
-}
-
-declare double @llvm.powi.f64(double, i32) nounwind readnone