Index: cmake/modules/HandleLLVMOptions.cmake =================================================================== --- cmake/modules/HandleLLVMOptions.cmake +++ cmake/modules/HandleLLVMOptions.cmake @@ -642,6 +642,9 @@ append_common_sanitizer_flags() append("-fsanitize=address,undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + elseif (LLVM_USE_SANITIZER STREQUAL "Leaks") + append_common_sanitizer_flags() + append("-fsanitize=leak" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() message(FATAL_ERROR "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}") endif() Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -475,6 +475,33 @@ def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; +def int_amdgcn_tbuffer_load : Intrinsic < + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + +def int_amdgcn_tbuffer_store : Intrinsic < + [], + [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + class AMDGPUBufferAtomic : Intrinsic < [llvm_i32_ty], [llvm_i32_ty, // vdata(VGPR) Index: include/llvm/IR/Statepoint.h =================================================================== --- include/llvm/IR/Statepoint.h +++ include/llvm/IR/Statepoint.h @@ -62,7 +62,10 @@ bool isStatepoint(const Value &V); bool isGCRelocate(ImmutableCallSite CS); +bool isGCRelocate(const Value *V); + bool isGCResult(ImmutableCallSite CS); +bool isGCResult(const Value *V); /// Analogous to CallSiteBase, this provides most of the actual /// functionality for Statepoint and ImmutableStatepoint. It is Index: include/llvm/Option/OptParser.td =================================================================== --- include/llvm/Option/OptParser.td +++ include/llvm/Option/OptParser.td @@ -92,6 +92,7 @@ int NumArgs = 0; string HelpText = ?; string MetaVarName = ?; + string Values = ?; list Flags = []; OptionGroup Group = ?; Option Alias = ?; @@ -126,6 +127,7 @@ class Group { OptionGroup Group = group; } class HelpText { string HelpText = text; } class MetaVarName { string MetaVarName = name; } +class Values { string Values = value; } // Predefined options. Index: include/llvm/Option/OptTable.h =================================================================== --- include/llvm/Option/OptTable.h +++ include/llvm/Option/OptTable.h @@ -53,6 +53,7 @@ unsigned short GroupID; unsigned short AliasID; const char *AliasArgs; + const char *Values; }; private: @@ -120,6 +121,19 @@ return getInfo(id).MetaVar; } + /// Find possible value for given flags. This is used for shell + /// autocompletion. + /// + /// \param [in] Option - Key flag like "-stdlib=" when "-stdlib=l" + /// was passed to clang. + /// + /// \param [in] Arg - Value which we want to autocomplete like "l" + /// when "-stdlib=l" was passed to clang. 
+  ///
+  /// \return The vector of possible values.
+  std::vector<std::string> suggestValueCompletions(StringRef Option,
+                                                   StringRef Arg) const;
+
   /// Find flags from OptTable which starts with Cur.
   ///
   /// \param [in] Cur - String prefix that all returned flags need
Index: include/llvm/Option/Option.h
===================================================================
--- include/llvm/Option/Option.h
+++ include/llvm/Option/Option.h
@@ -57,6 +57,7 @@
     UnknownClass,
     FlagClass,
     JoinedClass,
+    ValuesClass,
     SeparateClass,
     RemainingArgsClass,
     RemainingArgsJoinedClass,
@@ -155,6 +156,7 @@
     case CommaJoinedClass:
       return RenderCommaJoinedStyle;
     case FlagClass:
+    case ValuesClass:
     case SeparateClass:
     case MultiArgClass:
     case JoinedOrSeparateClass:
Index: include/llvm/Support/Error.h
===================================================================
--- include/llvm/Support/Error.h
+++ include/llvm/Support/Error.h
@@ -1076,6 +1076,27 @@
   llvm_unreachable("Failure value returned from cantFail wrapped call");
 }
 
+/// Report a fatal error if ValOrErr is a failure value, otherwise unwraps and
+/// returns the contained reference.
+///
+/// This function can be used to wrap calls to fallible functions ONLY when it
+/// is known that the Error will always be a success value. E.g.
+///
+///   @code{.cpp}
+///   // foo only attempts the fallible operation if DoFallibleOperation is
+///   // true. If DoFallibleOperation is false then foo always returns a Bar&.
+///   Expected<Bar&> foo(bool DoFallibleOperation);
+///
+///   Bar &X = cantFail(foo(false));
+///   @endcode
+template <typename T>
+T& cantFail(Expected<T &> ValOrErr) {
+  if (ValOrErr)
+    return *ValOrErr;
+  else
+    llvm_unreachable("Failure value returned from cantFail wrapped call");
+}
+
 } // end namespace llvm
 
 #endif // LLVM_SUPPORT_ERROR_H
Index: include/llvm/Target/GenericOpcodes.td
===================================================================
--- include/llvm/Target/GenericOpcodes.td
+++ include/llvm/Target/GenericOpcodes.td
@@ -386,6 +386,15 @@
   let isCommutable = 1;
 }
 
+// Generic fused multiply-add instruction.
+// Behaves like the llvm.fma intrinsic, i.e. src1 * src2 + src3.
+def G_FMA : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+  let hasSideEffects = 0;
+  let isCommutable = 0;
+}
+
 // Generic FP division.
 def G_FDIV : Instruction {
   let OutOperandList = (outs type0:$dst);
Index: include/llvm/Target/TargetOpcodes.def
===================================================================
--- include/llvm/Target/TargetOpcodes.def
+++ include/llvm/Target/TargetOpcodes.def
@@ -359,6 +359,9 @@
 /// Generic FP multiplication.
 HANDLE_TARGET_OPCODE(G_FMUL)
 
+/// Generic fused multiply-add. Behaves like the llvm.fma intrinsic.
+HANDLE_TARGET_OPCODE(G_FMA)
+
 /// Generic FP division.
HANDLE_TARGET_OPCODE(G_FDIV) Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -126,7 +126,7 @@ static cl::opt MulOpsInlineThreshold( "scev-mulops-inline-threshold", cl::Hidden, cl::desc("Threshold for inlining multiplication operands into a SCEV"), - cl::init(1000)); + cl::init(32)); static cl::opt AddOpsInlineThreshold( "scev-addops-inline-threshold", cl::Hidden, Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2801,26 +2801,24 @@ } // Before we switch over, we force a reference to a label inside the - // xray_instr_map and xray_fn_idx sections. Since this function is always - // called just before the function's end, we assume that this is happening - // after the last return instruction. We also use the synthetic label in the - // xray_inster_map as a delimeter for the range of sleds for this function in - // the index. + // xray_fn_idx sections. This makes sure that the xray_fn_idx section is kept + // live by the linker if the function is not garbage-collected. Since this + // function is always called just before the function's end, we assume that + // this is happening after the last return instruction. auto WordSizeBytes = MAI->getCodePointerSize(); - MCSymbol *SledsStart = OutContext.createTempSymbol("xray_synthetic_", true); MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true); OutStreamer->EmitCodeAlignment(16); - OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes, false); OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false); // Now we switch to the instrumentation map section. Because this is done // per-function, we are able to create an index entry that will represent the // range of sleds associated with a function. + MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true); OutStreamer->SwitchSection(InstMap); OutStreamer->EmitLabel(SledsStart); for (const auto &Sled : Sleds) Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym); - MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_synthetic_end", true); + MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true); OutStreamer->EmitLabel(SledsEnd); // We then emit a single entry in the index per function. We use the symbols Index: lib/CodeGen/AsmPrinter/DwarfDebug.cpp =================================================================== --- lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1000,12 +1000,14 @@ if (Pred->getFlag(MachineInstr::FrameSetup)) break; auto PredDL = Pred->getDebugLoc(); - if (!PredDL || Pred->isDebugValue()) + if (!PredDL || Pred->isMetaInstruction()) continue; // Check whether the instruction preceding the DBG_VALUE is in the same // (sub)scope as the DBG_VALUE. 
- if (DL->getScope() == PredDL->getScope() || - LScope->dominates(LScopes.findLexicalScope(PredDL))) + if (DL->getScope() == PredDL->getScope()) + return false; + auto *PredScope = LScopes.findLexicalScope(PredDL); + if (!PredScope || LScope->dominates(PredScope)) return false; } Index: lib/CodeGen/GlobalISel/IRTranslator.cpp =================================================================== --- lib/CodeGen/GlobalISel/IRTranslator.cpp +++ lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -686,6 +686,13 @@ .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; + case Intrinsic::fma: + MIRBuilder.buildInstr(TargetOpcode::G_FMA) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))) + .addUse(getOrCreateVReg(*CI.getArgOperand(2))); + return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: Index: lib/CodeGen/ImplicitNullChecks.cpp =================================================================== --- lib/CodeGen/ImplicitNullChecks.cpp +++ lib/CodeGen/ImplicitNullChecks.cpp @@ -359,30 +359,15 @@ Offset < PageSize)) return SR_Unsuitable; - // Finally, we need to make sure that the access instruction actually is - // accessing from PointerReg, and there isn't some re-definition of PointerReg - // between the compare and the memory access. - // If PointerReg has been redefined before then there is no sense to continue - // lookup due to this condition will fail for any further instruction. - SuitabilityResult Suitable = SR_Suitable; - for (auto *PrevMI : PrevInsts) - for (auto &PrevMO : PrevMI->operands()) { - if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() && - TRI->regsOverlap(PrevMO.getReg(), PointerReg)) - return SR_Impossible; - - // Check whether the current memory access aliases with previous one. - // If we already found that it aliases then no need to continue. - // But we continue base pointer check as it can result in SR_Impossible. - if (Suitable == SR_Suitable) { - AliasResult AR = areMemoryOpsAliased(MI, PrevMI); - if (AR == AR_WillAliasEverything) - return SR_Impossible; - if (AR == AR_MayAlias) - Suitable = SR_Unsuitable; - } - } - return Suitable; + // Finally, check whether the current memory access aliases with previous one. + for (auto *PrevMI : PrevInsts) { + AliasResult AR = areMemoryOpsAliased(MI, PrevMI); + if (AR == AR_WillAliasEverything) + return SR_Impossible; + if (AR == AR_MayAlias) + return SR_Unsuitable; + } + return SR_Suitable; } bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI, @@ -569,6 +554,12 @@ return true; } + // If MI re-defines the PointerReg then we cannot move further. + if (any_of(MI.operands(), [&](MachineOperand &MO) { + return MO.isReg() && MO.getReg() && MO.isDef() && + TRI->regsOverlap(MO.getReg(), PointerReg); + })) + return false; InstsSeenSoFar.push_back(&MI); } Index: lib/CodeGen/RegisterScavenging.cpp =================================================================== --- lib/CodeGen/RegisterScavenging.cpp +++ lib/CodeGen/RegisterScavenging.cpp @@ -372,60 +372,62 @@ /// clobbered for the longest time. /// Returns the register and the earliest position we know it to be free or /// the position MBB.end() if no register is available. 
-static std::pair<unsigned, MachineBasicBlock::iterator>
-findSurvivorBackwards(const TargetRegisterInfo &TRI,
+static std::pair<MCPhysReg, MachineBasicBlock::iterator>
+findSurvivorBackwards(const MachineRegisterInfo &MRI,
     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
-    BitVector &Available, BitVector &Candidates) {
+    const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder) {
   bool FoundTo = false;
-  unsigned Survivor = 0;
+  MCPhysReg Survivor = 0;
   MachineBasicBlock::iterator Pos;
   MachineBasicBlock &MBB = *From->getParent();
   unsigned InstrLimit = 25;
   unsigned InstrCountDown = InstrLimit;
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  LiveRegUnits Used(TRI);
+
   for (MachineBasicBlock::iterator I = From;; --I) {
     const MachineInstr &MI = *I;
 
-    // Remove any candidates touched by instruction.
-    bool FoundVReg = false;
-    for (const MachineOperand &MO : MI.operands()) {
-      if (MO.isRegMask()) {
-        Candidates.clearBitsNotInMask(MO.getRegMask());
-        continue;
-      }
-      if (!MO.isReg() || MO.isUndef() || MO.isDebug())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        FoundVReg = true;
-      } else if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
-          Candidates.reset(*AI);
-      }
-    }
+    Used.accumulateBackward(MI);
 
     if (I == To) {
-      // If one of the available registers survived this long take it.
-      Available &= Candidates;
-      int Reg = Available.find_first();
-      if (Reg != -1)
-        return std::make_pair(Reg, MBB.end());
+      // See if one of the registers in the allocation order has not been used
+      // so far.
+      for (MCPhysReg Reg : AllocationOrder) {
+        if (!MRI.isReserved(Reg) && Used.available(Reg) &&
+            LiveOut.available(Reg))
+          return std::make_pair(Reg, MBB.end());
+      }
       // Otherwise we will continue up to InstrLimit instructions to find
       // the register which is not defined/used for the longest time.
       FoundTo = true;
       Pos = To;
     }
     if (FoundTo) {
-      if (Survivor == 0 || !Candidates.test(Survivor)) {
-        int Reg = Candidates.find_first();
-        if (Reg == -1)
+      if (Survivor == 0 || !Used.available(Survivor)) {
+        MCPhysReg AvailableReg = 0;
+        for (MCPhysReg Reg : AllocationOrder) {
+          if (!MRI.isReserved(Reg) && Used.available(Reg)) {
+            AvailableReg = Reg;
+            break;
+          }
+        }
+        if (AvailableReg == 0)
           break;
-        Survivor = Reg;
+        Survivor = AvailableReg;
       }
       if (--InstrCountDown == 0)
         break;
+
+      // Keep searching when we find a vreg since the spilled register will
+      // be useful for this other vreg as well later.
+      bool FoundVReg = false;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+          FoundVReg = true;
+          break;
+        }
+      }
       if (FoundVReg) {
-        // Keep searching when we find a vreg since the spilled register will
-        // be usefull for this other vreg as well later.
         InstrCountDown = InstrLimit;
         Pos = I;
       }
@@ -568,18 +570,13 @@
                                         bool RestoreAfter, int SPAdj) {
   const MachineBasicBlock &MBB = *To->getParent();
   const MachineFunction &MF = *MBB.getParent();
-  // Consider all allocatable registers in the register class initially
-  BitVector Candidates = TRI->getAllocatableSet(MF, &RC);
-
-  // Try to find a register that's unused if there is one, as then we won't
-  // have to spill.
-  BitVector Available = getRegsAvailable(&RC);
 
   // Find the register whose use is furthest away.
MachineBasicBlock::iterator UseMI; - std::pair P = - findSurvivorBackwards(*TRI, MBBI, To, Available, Candidates); - unsigned Reg = P.first; + ArrayRef AllocationOrder = RC.getRawAllocationOrder(MF); + std::pair P = + findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder); + MCPhysReg Reg = P.first; MachineBasicBlock::iterator SpillBefore = P.second; assert(Reg != 0 && "No register left to scavenge!"); // Found an available register? Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14055,6 +14055,11 @@ // when we start sorting the vectors by type. return SDValue(); } + } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && + InVT1.getSizeInBits() == VT.getSizeInBits()) { + SmallVector ConcatOps(2, DAG.getUNDEF(InVT2)); + ConcatOps[0] = VecIn2; + VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3530,17 +3530,24 @@ LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); - // The high part is obtained by SRA'ing all but one of the bits of low - // part. - unsigned LoSize = VT.getSizeInBits(); - SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); - SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + SDValue HiLHS; + SDValue HiRHS; + if (isSigned) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getSizeInBits(); + HiLHS = + DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + HiRHS = + DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } // Here we're passing the 2 arguments explicitly as 4 arguments that are // pre-lowered to the correct types. 
This all depends upon WideVT not Index: lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp =================================================================== --- lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -51,6 +51,7 @@ uint32_t ModIndex, msf::MSFBuilder &Msf) : MSF(Msf), ModuleName(ModuleName) { + ::memset(&Layout, 0, sizeof(Layout)); Layout.Mod = ModIndex; } @@ -102,6 +103,7 @@ template Foo makeFoo(T &&t) { return Foo(std::move(t)); } void DbiModuleDescriptorBuilder::finalize() { + Layout.SC.ModuleIndex = Layout.Mod; Layout.FileNameOffs = 0; // TODO: Fix this Layout.Flags = 0; // TODO: Fix this Layout.C11Bytes = 0; Index: lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp =================================================================== --- lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -237,6 +237,7 @@ return EC; DbiStreamHeader *H = Allocator.Allocate(); + ::memset(H, 0, sizeof(DbiStreamHeader)); H->VersionHeader = *VerHeader; H->VersionSignature = -1; H->Age = Age; Index: lib/IR/Statepoint.cpp =================================================================== --- lib/IR/Statepoint.cpp +++ lib/IR/Statepoint.cpp @@ -44,10 +44,22 @@ return CS.getInstruction() && isa(CS.getInstruction()); } +bool llvm::isGCRelocate(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCRelocate(CS); + return false; +} + bool llvm::isGCResult(ImmutableCallSite CS) { return CS.getInstruction() && isa(CS.getInstruction()); } +bool llvm::isGCResult(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCResult(CS); + return false; +} + bool llvm::isStatepointDirectiveAttr(Attribute Attr) { return Attr.hasAttribute("statepoint-id") || Attr.hasAttribute("statepoint-num-patch-bytes"); Index: lib/ObjectYAML/CodeViewYAMLSymbols.cpp =================================================================== --- lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -286,16 +286,15 @@ } template <> void SymbolRecordImpl::map(IO &IO) { - // TODO: Print the linkage name - - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); - IO.mapRequired("PtrNext", Symbol.Next); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); + IO.mapOptional("PtrNext", Symbol.Next, 0U); IO.mapRequired("CodeSize", Symbol.CodeSize); IO.mapRequired("DbgStart", Symbol.DbgStart); IO.mapRequired("DbgEnd", Symbol.DbgEnd); IO.mapRequired("FunctionType", Symbol.FunctionType); - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Flags", Symbol.Flags); IO.mapRequired("DisplayName", Symbol.Name); } @@ -308,8 +307,8 @@ template <> void SymbolRecordImpl::map(IO &IO) { IO.mapRequired("Flags", Symbol.Flags); - IO.mapRequired("Seg", Symbol.Segment); - IO.mapRequired("Off", Symbol.Offset); + IO.mapOptional("Offset", Symbol.Offset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Name", Symbol.Name); } @@ -325,8 +324,8 @@ } template <> void SymbolRecordImpl::map(IO &IO) { - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); IO.mapRequired("Inlinee", Symbol.Inlinee); // TODO: The binary annotations } @@ -368,17 +367,17 @@ } template <> void 
SymbolRecordImpl::map(IO &IO) {
-  // TODO: Print the linkage name
-  IO.mapRequired("PtrParent", Symbol.Parent);
-  IO.mapRequired("PtrEnd", Symbol.End);
+  IO.mapOptional("PtrParent", Symbol.Parent, 0U);
+  IO.mapOptional("PtrEnd", Symbol.End, 0U);
   IO.mapRequired("CodeSize", Symbol.CodeSize);
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("BlockName", Symbol.Name);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Print the linkage name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("Flags", Symbol.Flags);
   IO.mapRequired("Flags", Symbol.Flags);
   IO.mapRequired("DisplayName", Symbol.Name);
@@ -428,8 +427,8 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("Type", Symbol.Type);
 }
 
@@ -441,14 +440,13 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize);
   IO.mapRequired("Type", Symbol.Type);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
   IO.mapRequired("Register", Symbol.Register);
   IO.mapRequired("CookieKind", Symbol.CookieKind);
   IO.mapRequired("Flags", Symbol.Flags);
@@ -487,14 +485,16 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map linkage name
   IO.mapRequired("Type", Symbol.Type);
+  IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("DisplayName", Symbol.Name);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map linkage name
   IO.mapRequired("Type", Symbol.Type);
+  IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("DisplayName", Symbol.Name);
 }
 }
Index: lib/Option/OptTable.cpp
===================================================================
--- lib/Option/OptTable.cpp
+++ lib/Option/OptTable.cpp
@@ -194,6 +194,37 @@
   return 0;
 }
 
+// Returns true if one of the Prefixes + In.Names matches Option
+static bool optionMatches(const OptTable::Info &In, StringRef Option) {
+  if (In.Values && In.Prefixes)
+    for (size_t I = 0; In.Prefixes[I]; I++)
+      if (Option == std::string(In.Prefixes[I]) + In.Name)
+        return true;
+  return false;
+}
+
+// This function is for flag value completion.
+// E.g. when "-stdlib=" and "l" are passed to this function, it returns the
+// appropriate values for stdlib that start with "l".
+std::vector<std::string>
+OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
+  // Search all options and return possible values.
+ for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { + if (!optionMatches(In, Option)) + continue; + + SmallVector Candidates; + StringRef(In.Values).split(Candidates, ",", -1, false); + + std::vector Result; + for (StringRef Val : Candidates) + if (Val.startswith(Arg)) + Result.push_back(Val); + return Result; + } + return {}; +} + std::vector OptTable::findByPrefix(StringRef Cur) const { std::vector Ret; for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { @@ -336,6 +367,9 @@ case Option::FlagClass: break; + case Option::ValuesClass: + break; + case Option::SeparateClass: case Option::JoinedOrSeparateClass: case Option::RemainingArgsClass: case Option::RemainingArgsJoinedClass: Name += ' '; Index: lib/Option/Option.cpp =================================================================== --- lib/Option/Option.cpp +++ lib/Option/Option.cpp @@ -47,6 +47,7 @@ P(UnknownClass); P(FlagClass); P(JoinedClass); + P(ValuesClass); P(SeparateClass); P(CommaJoinedClass); P(MultiArgClass); Index: lib/Support/Unix/Program.inc =================================================================== --- lib/Support/Unix/Program.inc +++ lib/Support/Unix/Program.inc @@ -449,11 +449,22 @@ size_t ArgLength = Program.size() + 1; for (ArrayRef::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { - ArgLength += strlen(*I) + 1; + size_t length = strlen(*I); + + // Ensure that we do not exceed the MAX_ARG_STRLEN constant on Linux, which + // does not have a constant unlike what the man pages would have you + // believe. Since this limit is pretty high, perform the check + // unconditionally rather than trying to be aggressive and limiting it to + // Linux only. + if (length >= (32 * 4096)) + return false; + + ArgLength += length + 1; if (ArgLength > size_t(HalfArgMax)) { return false; } } + return true; } } Index: lib/Support/raw_ostream.cpp =================================================================== --- lib/Support/raw_ostream.cpp +++ lib/Support/raw_ostream.cpp @@ -548,7 +548,11 @@ pos += Size; #ifndef LLVM_ON_WIN32 +#if defined(__linux__) + bool ShouldWriteInChunks = true; +#else bool ShouldWriteInChunks = false; +#endif #else // Writing a large size of output to Windows console returns ENOMEM. It seems // that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and Index: lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp =================================================================== --- lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -43,26 +43,25 @@ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. 
- // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -644,7 +644,11 @@ "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -403,6 +403,8 @@ STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3664,6 +3664,8 @@ NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- 
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -75,8 +75,10 @@ return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector) const; + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -184,9 +184,9 @@ } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -195,8 +195,18 @@ return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. + return getHardwareNumberOfRegisters(Vec) >> 3; +} + unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { - return Vector ? 0 : 32; + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { @@ -247,11 +257,11 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. if (VF == 1) return 1; - // Semi-arbitrary large amount. 
- return 64; + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -292,6 +294,8 @@ bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -636,6 +640,8 @@ case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -1029,6 +1035,8 @@ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -1042,6 +1050,7 @@ AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -2554,11 +2563,21 @@ return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if (getParser().parseAbsoluteExpression(Int)) return MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -3743,6 +3762,44 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3870,6 +3927,10 @@ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3919,6 +3980,8 @@ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -57,6 +57,11 @@ string OpName = NAME # suffix; } +class MTBUFAddr64Table { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -78,14 +83,31 @@ let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real : - InstSI , - Enc64 { + InstSI { let isPseudo = 0; let isCodeGenOnly = 0; @@ -97,57 +119,168 @@ let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} 
= tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA vdataList, + list vaddrList=[]> { + RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); } -class MTBUF_Load_Pseudo : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class getMTBUFIns vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA.ret, + (ins)))))); +} + +class getMTBUFAsmOps { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads { + + def _OFFSET : MTBUF_Load_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo , + 
MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; + } +} + +class MTBUF_Store_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores { + + def _OFFSET : MTBUF_Store_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo , + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +809,14 @@ // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,22 +1226,98 @@ // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// 
tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; + +multiclass MTBUF_StoreIntrinsicPat { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; } // End let Predicates = [isGCN] @@ -1224,21 +1433,44 @@ class MTBUF_Real_si op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; 
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si op> { + def _OFFSET_si : MTBUF_Real_si (NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si (NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si (NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si (NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si (NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1582,39 @@ class MTBUF_Real_vi op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi op> { + def _OFFSET_vi : MTBUF_Real_vi (NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi (NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi (NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : 
MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -31,8 +31,6 @@ let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -40,10 +38,16 @@ string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real op, FLAT_Pseudo ps> : @@ -68,7 +72,10 @@ // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? - bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat acceses. @@ -81,7 +88,7 @@ // Only valid on GFX9+ let Inst{12-0} = offset; let Inst{13} = lds; - let Inst{15-14} = 0; + let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; @@ -106,6 +113,16 @@ let mayLoad = 1; } +class FLAT_Global_Load_Pseudo : + FLAT_Load_Pseudo { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo : + FLAT_Load_Pseudo { + let is_flat_scratch = 1; +} + class FLAT_Store_Pseudo : FLAT_Pseudo< opName, @@ -119,6 +136,16 @@ let has_vdst = 0; } +class FLAT_Global_Store_Pseudo : + FLAT_Store_Pseudo { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo : + FLAT_Store_Pseudo { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, @@ -306,6 +333,26 @@ } // End SubtargetPredicate = isCI +let SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns 
//===----------------------------------------------------------------------===// @@ -557,3 +604,18 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,7 +88,11 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,6 +72,11 @@ O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ 
-118,6 +123,16 @@ } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? "offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -174,6 +174,31 @@ return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -214,6 +239,9 @@ if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? 
unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -563,6 +591,8 @@ break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -166,6 +167,8 @@ if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -470,7 +473,7 @@ return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -921,12 +924,9 @@ // level. bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3288,6 +3288,8 @@ SelectionDAG &DAG) const { unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3313,7 +3315,6 @@ Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3328,6 +3329,29 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3393,11 +3417,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); @@ -3463,33 +3487,6 @@ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3505,7 +3502,6 @@ } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3514,6 +3510,76 @@ } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc + }; + + const ConstantSDNode *tfe = cast(Op.getOperand(14)); + assert(tfe->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -39,25 +39,41 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -525,7 +541,7 @@ def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : 
NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -545,6 +561,9 @@ def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. return true; Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -388,7 +388,7 @@ // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously dependent on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. addPass(&LiveVariablesID, false); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1662,6 +1662,12 @@ MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; + + // TODO: These control memcmp expansion in CGP and are set low to prevent + // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. + MaxLoadsPerMemcmp = 1; + MaxLoadsPerMemcmpOptSize = 1; + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -14272,7 +14278,8 @@ // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. - if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && + 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? 
i + NumElts : i); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -107,7 +107,7 @@ bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2232,6 +2232,12 @@ return (CallerBits & CalleeBits) == CalleeBits; } +bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + // TODO: We can increase these based on available vector ops. + MaxLoadSize = ST->is64Bit() ? 8 : 4; + return true; +} + bool X86TTIImpl::enableInterleavedAccessVectorization() { // TODO: We expect this to be beneficial regardless of arch, // but there are currently some unexplained performance artifacts on Atom. Index: lib/ToolDrivers/llvm-lib/LibDriver.cpp =================================================================== --- lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -31,7 +31,7 @@ enum { OPT_INVALID = 0, -#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, #include "Options.inc" #undef OPTION }; @@ -41,11 +41,9 @@ #undef PREFIX static const llvm::opt::OptTable::Info infoTable[] = { -#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ - { \ - X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ - OPT_##GROUP, OPT_##ALIAS, X6 \ - }, +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10, X11) \ + {X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X8, X7, OPT_##GROUP, OPT_##ALIAS, X6, X11}, #include "Options.inc" #undef OPTION }; Index: lib/Transforms/Scalar/NewGVN.cpp =================================================================== --- lib/Transforms/Scalar/NewGVN.cpp +++ lib/Transforms/Scalar/NewGVN.cpp @@ -3025,12 +3025,10 @@ // It's okay to have the same expression already in there if it is // identical in nature. // This can happen when the leader of the stored value changes over time. 
- if (!Okay) { - Okay = Okay && std::get<1>(Res.first->second) == KV.second; - Okay = Okay && - lookupOperandLeader(std::get<2>(Res.first->second)) == - lookupOperandLeader(SE->getStoredValue()); - } + if (!Okay) + Okay = (std::get<1>(Res.first->second) == KV.second) && + (lookupOperandLeader(std::get<2>(Res.first->second)) == + lookupOperandLeader(SE->getStoredValue())); assert(Okay && "Stored expression conflict exists in expression table"); auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst()); assert(ValueExpr && ValueExpr->equals(*SE) && Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -1,516 +1,564 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; CHECK-LABEL: 'add' define i32 @add(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = add - ; SSE42: cost of 1 {{.*}} %A = add - ; AVX: cost of 1 {{.*}} %A = add - ; AVX2: cost of 1 {{.*}} %A = add - ; AVX512: cost of 1 {{.*}} %A = add - %A = add <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = add - ; SSE42: cost of 2 {{.*}} %B = add - ; AVX: cost of 4 
{{.*}} %B = add - ; AVX2: cost of 1 {{.*}} %B = add - ; AVX512: cost of 1 {{.*}} %B = add - %B = add <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = add - ; SSE42: cost of 4 {{.*}} %C = add - ; AVX: cost of 8 {{.*}} %C = add - ; AVX2: cost of 2 {{.*}} %C = add - ; AVX512: cost of 1 {{.*}} %C = add - %C = add <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = add - ; SSE42: cost of 1 {{.*}} %D = add - ; AVX: cost of 1 {{.*}} %D = add - ; AVX2: cost of 1 {{.*}} %D = add - ; AVX512: cost of 1 {{.*}} %D = add - %D = add <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = add - ; SSE42: cost of 2 {{.*}} %E = add - ; AVX: cost of 4 {{.*}} %E = add - ; AVX2: cost of 1 {{.*}} %E = add - ; AVX512: cost of 1 {{.*}} %E = add - %E = add <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = add - ; SSE42: cost of 4 {{.*}} %F = add - ; AVX: cost of 8 {{.*}} %F = add - ; AVX2: cost of 2 {{.*}} %F = add - ; AVX512: cost of 1 {{.*}} %F = add - %F = add <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = add - ; SSE42: cost of 1 {{.*}} %G = add - ; AVX: cost of 1 {{.*}} %G = add - ; AVX2: cost of 1 {{.*}} %G = add - ; AVX512: cost of 1 {{.*}} %G = add - %G = add <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = add - ; SSE42: cost of 2 {{.*}} %H = add - ; AVX: cost of 4 {{.*}} %H = add - ; AVX2: cost of 1 {{.*}} %H = add - ; AVX512: cost of 1 {{.*}} %H = add - %H = add <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = add - ; SSE42: cost of 4 {{.*}} %I = add - ; AVX: cost of 8 {{.*}} %I = add - ; AVX2: cost of 2 {{.*}} %I = add - ; AVX512F: cost of 2 {{.*}} %I = add - ; AVX512BW: cost of 1 {{.*}} %I = add - %I = add <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = add - ; SSE42: cost of 1 {{.*}} %J = add - ; AVX: cost of 1 {{.*}} %J = add - ; AVX2: cost of 1 {{.*}} %J = add - ; AVX512: cost of 1 {{.*}} %J = add - %J = add <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = add - ; SSE42: cost of 2 {{.*}} %K = add - ; AVX: cost of 4 {{.*}} %K = add - ; AVX2: cost of 1 {{.*}} %K = add - ; AVX512: cost of 1 {{.*}} %K = add - %K = add <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = add - ; SSE42: cost of 4 {{.*}} %L = add - ; AVX: cost of 8 {{.*}} %L = add - ; AVX2: cost of 2 {{.*}} %L = add - ; AVX512F: cost of 2 {{.*}} %L = add - ; AVX512BW: cost of 1 {{.*}} %L = add - %L = add <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = add + %I64 = add i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = add + ; SSE42: cost of 1 {{.*}} %V2I64 = add + ; AVX: cost of 1 {{.*}} %V2I64 = add + ; AVX2: cost of 1 {{.*}} %V2I64 = add + ; AVX512: cost of 1 {{.*}} %V2I64 = add + %V2I64 = add <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = add + ; SSE42: cost of 2 {{.*}} %V4I64 = add + ; AVX: cost of 4 {{.*}} %V4I64 = add + ; AVX2: cost of 1 {{.*}} %V4I64 = add + ; AVX512: cost of 1 {{.*}} %V4I64 = add + %V4I64 = add <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = add + ; SSE42: cost of 4 {{.*}} %V8I64 = add + ; AVX: cost of 8 {{.*}} %V8I64 = add + ; AVX2: cost of 2 {{.*}} %V8I64 = add + ; AVX512: cost of 1 {{.*}} %V8I64 = add + %V8I64 = add <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = add + %I32 = add i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = add + ; SSE42: cost of 1 {{.*}} %V4I32 = add + ; AVX: cost of 1 {{.*}} %V4I32 = add + ; AVX2: cost of 1 {{.*}} %V4I32 = add + ; AVX512: cost of 1 {{.*}} %V4I32 = add + %V4I32 = add <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = add + ; SSE42: cost of 2 
{{.*}} %V8I32 = add + ; AVX: cost of 4 {{.*}} %V8I32 = add + ; AVX2: cost of 1 {{.*}} %V8I32 = add + ; AVX512: cost of 1 {{.*}} %V8I32 = add + %V8I32 = add <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = add + ; SSE42: cost of 4 {{.*}} %V16I32 = add + ; AVX: cost of 8 {{.*}} %V16I32 = add + ; AVX2: cost of 2 {{.*}} %V16I32 = add + ; AVX512: cost of 1 {{.*}} %V16I32 = add + %V16I32 = add <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = add + %I16 = add i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = add + ; SSE42: cost of 1 {{.*}} %V8I16 = add + ; AVX: cost of 1 {{.*}} %V8I16 = add + ; AVX2: cost of 1 {{.*}} %V8I16 = add + ; AVX512: cost of 1 {{.*}} %V8I16 = add + %V8I16 = add <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = add + ; SSE42: cost of 2 {{.*}} %V16I16 = add + ; AVX: cost of 4 {{.*}} %V16I16 = add + ; AVX2: cost of 1 {{.*}} %V16I16 = add + ; AVX512: cost of 1 {{.*}} %V16I16 = add + %V16I16 = add <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = add + ; SSE42: cost of 4 {{.*}} %V32I16 = add + ; AVX: cost of 8 {{.*}} %V32I16 = add + ; AVX2: cost of 2 {{.*}} %V32I16 = add + ; AVX512F: cost of 2 {{.*}} %V32I16 = add + ; AVX512BW: cost of 1 {{.*}} %V32I16 = add + %V32I16 = add <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = add + %I8 = add i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = add + ; SSE42: cost of 1 {{.*}} %V16I8 = add + ; AVX: cost of 1 {{.*}} %V16I8 = add + ; AVX2: cost of 1 {{.*}} %V16I8 = add + ; AVX512: cost of 1 {{.*}} %V16I8 = add + %V16I8 = add <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = add + ; SSE42: cost of 2 {{.*}} %V32I8 = add + ; AVX: cost of 4 {{.*}} %V32I8 = add + ; AVX2: cost of 1 {{.*}} %V32I8 = add + ; AVX512: cost of 1 {{.*}} %V32I8 = add + %V32I8 = add <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = add + ; SSE42: cost of 4 {{.*}} %V64I8 = add + ; AVX: cost of 8 {{.*}} %V64I8 = add + ; AVX2: cost of 2 {{.*}} %V64I8 = add + ; AVX512F: cost of 2 {{.*}} %V64I8 = add + ; AVX512BW: cost of 1 {{.*}} %V64I8 = add + %V64I8 = add <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'sub' define i32 @sub(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = sub - ; SSE42: cost of 1 {{.*}} %A = sub - ; AVX: cost of 1 {{.*}} %A = sub - ; AVX2: cost of 1 {{.*}} %A = sub - ; AVX512: cost of 1 {{.*}} %A = sub - %A = sub <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = sub - ; SSE42: cost of 2 {{.*}} %B = sub - ; AVX: cost of 4 {{.*}} %B = sub - ; AVX2: cost of 1 {{.*}} %B = sub - ; AVX512: cost of 1 {{.*}} %B = sub - %B = sub <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = sub - ; SSE42: cost of 4 {{.*}} %C = sub - ; AVX: cost of 8 {{.*}} %C = sub - ; AVX2: cost of 2 {{.*}} %C = sub - ; AVX512: cost of 1 {{.*}} %C = sub - %C = sub <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = sub - ; SSE42: cost of 1 {{.*}} %D = sub - ; AVX: cost of 1 {{.*}} %D = sub - ; AVX2: cost of 1 {{.*}} %D = sub - ; AVX512: cost of 1 {{.*}} %D = sub - %D = sub <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = sub - ; SSE42: cost of 2 {{.*}} %E = sub - ; AVX: cost of 4 {{.*}} %E = sub - ; AVX2: cost of 1 {{.*}} %E = sub - ; AVX512: cost of 1 {{.*}} %E = sub - %E = sub <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = sub - ; SSE42: cost of 4 {{.*}} %F = sub - ; AVX: cost of 8 {{.*}} %F = sub - ; AVX2: cost of 2 {{.*}} %F = sub - ; AVX512: cost of 1 {{.*}} %F = sub - %F = sub <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = sub - ; SSE42: cost of 1 {{.*}} 
%G = sub - ; AVX: cost of 1 {{.*}} %G = sub - ; AVX2: cost of 1 {{.*}} %G = sub - ; AVX512: cost of 1 {{.*}} %G = sub - %G = sub <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = sub - ; SSE42: cost of 2 {{.*}} %H = sub - ; AVX: cost of 4 {{.*}} %H = sub - ; AVX2: cost of 1 {{.*}} %H = sub - ; AVX512: cost of 1 {{.*}} %H = sub - %H = sub <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = sub - ; SSE42: cost of 4 {{.*}} %I = sub - ; AVX: cost of 8 {{.*}} %I = sub - ; AVX2: cost of 2 {{.*}} %I = sub - ; AVX512F: cost of 2 {{.*}} %I = sub - ; AVX512BW: cost of 1 {{.*}} %I = sub - %I = sub <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = sub - ; SSE42: cost of 1 {{.*}} %J = sub - ; AVX: cost of 1 {{.*}} %J = sub - ; AVX2: cost of 1 {{.*}} %J = sub - ; AVX512: cost of 1 {{.*}} %J = sub - %J = sub <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = sub - ; SSE42: cost of 2 {{.*}} %K = sub - ; AVX: cost of 4 {{.*}} %K = sub - ; AVX2: cost of 1 {{.*}} %K = sub - ; AVX512: cost of 1 {{.*}} %K = sub - %K = sub <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = sub - ; SSE42: cost of 4 {{.*}} %L = sub - ; AVX: cost of 8 {{.*}} %L = sub - ; AVX2: cost of 2 {{.*}} %L = sub - ; AVX512F: cost of 2 {{.*}} %L = sub - ; AVX512BW: cost of 1 {{.*}} %L = sub - %L = sub <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = sub + %I64 = sub i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = sub + ; SSE42: cost of 1 {{.*}} %V2I64 = sub + ; AVX: cost of 1 {{.*}} %V2I64 = sub + ; AVX2: cost of 1 {{.*}} %V2I64 = sub + ; AVX512: cost of 1 {{.*}} %V2I64 = sub + %V2I64 = sub <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = sub + ; SSE42: cost of 2 {{.*}} %V4I64 = sub + ; AVX: cost of 4 {{.*}} %V4I64 = sub + ; AVX2: cost of 1 {{.*}} %V4I64 = sub + ; AVX512: cost of 1 {{.*}} %V4I64 = sub + %V4I64 = sub <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = sub + ; SSE42: cost of 4 {{.*}} %V8I64 = sub + ; AVX: cost of 8 {{.*}} %V8I64 = sub + ; AVX2: cost of 2 {{.*}} %V8I64 = sub + ; AVX512: cost of 1 {{.*}} %V8I64 = sub + %V8I64 = sub <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = sub + %I32 = sub i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = sub + ; SSE42: cost of 1 {{.*}} %V4I32 = sub + ; AVX: cost of 1 {{.*}} %V4I32 = sub + ; AVX2: cost of 1 {{.*}} %V4I32 = sub + ; AVX512: cost of 1 {{.*}} %V4I32 = sub + %V4I32 = sub <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = sub + ; SSE42: cost of 2 {{.*}} %V8I32 = sub + ; AVX: cost of 4 {{.*}} %V8I32 = sub + ; AVX2: cost of 1 {{.*}} %V8I32 = sub + ; AVX512: cost of 1 {{.*}} %V8I32 = sub + %V8I32 = sub <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = sub + ; SSE42: cost of 4 {{.*}} %V16I32 = sub + ; AVX: cost of 8 {{.*}} %V16I32 = sub + ; AVX2: cost of 2 {{.*}} %V16I32 = sub + ; AVX512: cost of 1 {{.*}} %V16I32 = sub + %V16I32 = sub <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = sub + %I16 = sub i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = sub + ; SSE42: cost of 1 {{.*}} %V8I16 = sub + ; AVX: cost of 1 {{.*}} %V8I16 = sub + ; AVX2: cost of 1 {{.*}} %V8I16 = sub + ; AVX512: cost of 1 {{.*}} %V8I16 = sub + %V8I16 = sub <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = sub + ; SSE42: cost of 2 {{.*}} %V16I16 = sub + ; AVX: cost of 4 {{.*}} %V16I16 = sub + ; AVX2: cost of 1 {{.*}} %V16I16 = sub + ; AVX512: cost of 1 {{.*}} %V16I16 = sub + %V16I16 = sub <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = sub + ; SSE42: cost of 4 {{.*}} %V32I16 
= sub + ; AVX: cost of 8 {{.*}} %V32I16 = sub + ; AVX2: cost of 2 {{.*}} %V32I16 = sub + ; AVX512F: cost of 2 {{.*}} %V32I16 = sub + ; AVX512BW: cost of 1 {{.*}} %V32I16 = sub + %V32I16 = sub <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = sub + %I8 = sub i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = sub + ; SSE42: cost of 1 {{.*}} %V16I8 = sub + ; AVX: cost of 1 {{.*}} %V16I8 = sub + ; AVX2: cost of 1 {{.*}} %V16I8 = sub + ; AVX512: cost of 1 {{.*}} %V16I8 = sub + %V16I8 = sub <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = sub + ; SSE42: cost of 2 {{.*}} %V32I8 = sub + ; AVX: cost of 4 {{.*}} %V32I8 = sub + ; AVX2: cost of 1 {{.*}} %V32I8 = sub + ; AVX512: cost of 1 {{.*}} %V32I8 = sub + %V32I8 = sub <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = sub + ; SSE42: cost of 4 {{.*}} %V64I8 = sub + ; AVX: cost of 8 {{.*}} %V64I8 = sub + ; AVX2: cost of 2 {{.*}} %V64I8 = sub + ; AVX512F: cost of 2 {{.*}} %V64I8 = sub + ; AVX512BW: cost of 1 {{.*}} %V64I8 = sub + %V64I8 = sub <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'or' define i32 @or(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = or - ; SSE42: cost of 1 {{.*}} %A = or - ; AVX: cost of 1 {{.*}} %A = or - ; AVX2: cost of 1 {{.*}} %A = or - ; AVX512: cost of 1 {{.*}} %A = or - %A = or <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = or - ; SSE42: cost of 2 {{.*}} %B = or - ; AVX: cost of 1 {{.*}} %B = or - ; AVX2: cost of 1 {{.*}} %B = or - ; AVX512: cost of 1 {{.*}} %B = or - %B = or <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = or - ; SSE42: cost of 4 {{.*}} %C = or - ; AVX: cost of 2 {{.*}} %C = or - ; AVX2: cost of 2 {{.*}} %C = or - ; AVX512: cost of 1 {{.*}} %C = or - %C = or <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = or - ; SSE42: cost of 1 {{.*}} %D = or - ; AVX: cost of 1 {{.*}} %D = or - ; AVX2: cost of 1 {{.*}} %D = or - ; AVX512: cost of 1 {{.*}} %D = or - %D = or <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = or - ; SSE42: cost of 2 {{.*}} %E = or - ; AVX: cost of 1 {{.*}} %E = or - ; AVX2: cost of 1 {{.*}} %E = or - ; AVX512: cost of 1 {{.*}} %E = or - %E = or <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = or - ; SSE42: cost of 4 {{.*}} %F = or - ; AVX: cost of 2 {{.*}} %F = or - ; AVX2: cost of 2 {{.*}} %F = or - ; AVX512: cost of 1 {{.*}} %F = or - %F = or <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = or - ; SSE42: cost of 1 {{.*}} %G = or - ; AVX: cost of 1 {{.*}} %G = or - ; AVX2: cost of 1 {{.*}} %G = or - ; AVX512: cost of 1 {{.*}} %G = or - %G = or <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = or - ; SSE42: cost of 2 {{.*}} %H = or - ; AVX: cost of 1 {{.*}} %H = or - ; AVX2: cost of 1 {{.*}} %H = or - ; AVX512: cost of 1 {{.*}} %H = or - %H = or <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = or - ; SSE42: cost of 4 {{.*}} %I = or - ; AVX: cost of 2 {{.*}} %I = or - ; AVX2: cost of 2 {{.*}} %I = or - ; AVX512F: cost of 2 {{.*}} %I = or - ; AVX512BW: cost of 1 {{.*}} %I = or - %I = or <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = or - ; SSE42: cost of 1 {{.*}} %J = or - ; AVX: cost of 1 {{.*}} %J = or - ; AVX2: cost of 1 {{.*}} %J = or - ; AVX512: cost of 1 {{.*}} %J = or - %J = or <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = or - ; SSE42: cost of 2 {{.*}} %K = or - ; AVX: cost of 1 {{.*}} %K = or - ; AVX2: cost of 1 {{.*}} %K = or - ; AVX512: cost of 1 {{.*}} %K = or - %K = or <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = or - ; SSE42: cost of 4 {{.*}} %L = 
or - ; AVX: cost of 2 {{.*}} %L = or - ; AVX2: cost of 2 {{.*}} %L = or - ; AVX512F: cost of 2 {{.*}} %L = or - ; AVX512BW: cost of 1 {{.*}} %L = or - %L = or <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = or + %I64 = or i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = or + ; SSE42: cost of 1 {{.*}} %V2I64 = or + ; AVX: cost of 1 {{.*}} %V2I64 = or + ; AVX2: cost of 1 {{.*}} %V2I64 = or + ; AVX512: cost of 1 {{.*}} %V2I64 = or + %V2I64 = or <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = or + ; SSE42: cost of 2 {{.*}} %V4I64 = or + ; AVX: cost of 1 {{.*}} %V4I64 = or + ; AVX2: cost of 1 {{.*}} %V4I64 = or + ; AVX512: cost of 1 {{.*}} %V4I64 = or + %V4I64 = or <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = or + ; SSE42: cost of 4 {{.*}} %V8I64 = or + ; AVX: cost of 2 {{.*}} %V8I64 = or + ; AVX2: cost of 2 {{.*}} %V8I64 = or + ; AVX512: cost of 1 {{.*}} %V8I64 = or + %V8I64 = or <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = or + %I32 = or i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = or + ; SSE42: cost of 1 {{.*}} %V4I32 = or + ; AVX: cost of 1 {{.*}} %V4I32 = or + ; AVX2: cost of 1 {{.*}} %V4I32 = or + ; AVX512: cost of 1 {{.*}} %V4I32 = or + %V4I32 = or <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = or + ; SSE42: cost of 2 {{.*}} %V8I32 = or + ; AVX: cost of 1 {{.*}} %V8I32 = or + ; AVX2: cost of 1 {{.*}} %V8I32 = or + ; AVX512: cost of 1 {{.*}} %V8I32 = or + %V8I32 = or <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = or + ; SSE42: cost of 4 {{.*}} %V16I32 = or + ; AVX: cost of 2 {{.*}} %V16I32 = or + ; AVX2: cost of 2 {{.*}} %V16I32 = or + ; AVX512: cost of 1 {{.*}} %V16I32 = or + %V16I32 = or <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = or + %I16 = or i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = or + ; SSE42: cost of 1 {{.*}} %V8I16 = or + ; AVX: cost of 1 {{.*}} %V8I16 = or + ; AVX2: cost of 1 {{.*}} %V8I16 = or + ; AVX512: cost of 1 {{.*}} %V8I16 = or + %V8I16 = or <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = or + ; SSE42: cost of 2 {{.*}} %V16I16 = or + ; AVX: cost of 1 {{.*}} %V16I16 = or + ; AVX2: cost of 1 {{.*}} %V16I16 = or + ; AVX512: cost of 1 {{.*}} %V16I16 = or + %V16I16 = or <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = or + ; SSE42: cost of 4 {{.*}} %V32I16 = or + ; AVX: cost of 2 {{.*}} %V32I16 = or + ; AVX2: cost of 2 {{.*}} %V32I16 = or + ; AVX512F: cost of 2 {{.*}} %V32I16 = or + ; AVX512BW: cost of 1 {{.*}} %V32I16 = or + %V32I16 = or <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = or + %I8 = or i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = or + ; SSE42: cost of 1 {{.*}} %V16I8 = or + ; AVX: cost of 1 {{.*}} %V16I8 = or + ; AVX2: cost of 1 {{.*}} %V16I8 = or + ; AVX512: cost of 1 {{.*}} %V16I8 = or + %V16I8 = or <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = or + ; SSE42: cost of 2 {{.*}} %V32I8 = or + ; AVX: cost of 1 {{.*}} %V32I8 = or + ; AVX2: cost of 1 {{.*}} %V32I8 = or + ; AVX512: cost of 1 {{.*}} %V32I8 = or + %V32I8 = or <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = or + ; SSE42: cost of 4 {{.*}} %V64I8 = or + ; AVX: cost of 2 {{.*}} %V64I8 = or + ; AVX2: cost of 2 {{.*}} %V64I8 = or + ; AVX512F: cost of 2 {{.*}} %V64I8 = or + ; AVX512BW: cost of 1 {{.*}} %V64I8 = or + %V64I8 = or <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'xor' define i32 @xor(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = xor - ; SSE42: cost of 1 {{.*}} %A = xor - ; AVX: cost of 1 {{.*}} %A = 
xor - ; AVX2: cost of 1 {{.*}} %A = xor - ; AVX512: cost of 1 {{.*}} %A = xor - %A = xor <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = xor - ; SSE42: cost of 2 {{.*}} %B = xor - ; AVX: cost of 1 {{.*}} %B = xor - ; AVX2: cost of 1 {{.*}} %B = xor - ; AVX512: cost of 1 {{.*}} %B = xor - %B = xor <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = xor - ; SSE42: cost of 4 {{.*}} %C = xor - ; AVX: cost of 2 {{.*}} %C = xor - ; AVX2: cost of 2 {{.*}} %C = xor - ; AVX512: cost of 1 {{.*}} %C = xor - %C = xor <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = xor - ; SSE42: cost of 1 {{.*}} %D = xor - ; AVX: cost of 1 {{.*}} %D = xor - ; AVX2: cost of 1 {{.*}} %D = xor - ; AVX512: cost of 1 {{.*}} %D = xor - %D = xor <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = xor - ; SSE42: cost of 2 {{.*}} %E = xor - ; AVX: cost of 1 {{.*}} %E = xor - ; AVX2: cost of 1 {{.*}} %E = xor - ; AVX512: cost of 1 {{.*}} %E = xor - %E = xor <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = xor - ; SSE42: cost of 4 {{.*}} %F = xor - ; AVX: cost of 2 {{.*}} %F = xor - ; AVX2: cost of 2 {{.*}} %F = xor - ; AVX512: cost of 1 {{.*}} %F = xor - %F = xor <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = xor - ; SSE42: cost of 1 {{.*}} %G = xor - ; AVX: cost of 1 {{.*}} %G = xor - ; AVX2: cost of 1 {{.*}} %G = xor - ; AVX512: cost of 1 {{.*}} %G = xor - %G = xor <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = xor - ; SSE42: cost of 2 {{.*}} %H = xor - ; AVX: cost of 1 {{.*}} %H = xor - ; AVX2: cost of 1 {{.*}} %H = xor - ; AVX512: cost of 1 {{.*}} %H = xor - %H = xor <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = xor - ; SSE42: cost of 4 {{.*}} %I = xor - ; AVX: cost of 2 {{.*}} %I = xor - ; AVX2: cost of 2 {{.*}} %I = xor - ; AVX512F: cost of 2 {{.*}} %I = xor - ; AVX512BW: cost of 1 {{.*}} %I = xor - %I = xor <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = xor - ; SSE42: cost of 1 {{.*}} %J = xor - ; AVX: cost of 1 {{.*}} %J = xor - ; AVX2: cost of 1 {{.*}} %J = xor - ; AVX512: cost of 1 {{.*}} %J = xor - %J = xor <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = xor - ; SSE42: cost of 2 {{.*}} %K = xor - ; AVX: cost of 1 {{.*}} %K = xor - ; AVX2: cost of 1 {{.*}} %K = xor - ; AVX512: cost of 1 {{.*}} %K = xor - %K = xor <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = xor - ; SSE42: cost of 4 {{.*}} %L = xor - ; AVX: cost of 2 {{.*}} %L = xor - ; AVX2: cost of 2 {{.*}} %L = xor - ; AVX512F: cost of 2 {{.*}} %L = xor - ; AVX512BW: cost of 1 {{.*}} %L = xor - %L = xor <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = xor + %I64 = xor i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = xor + ; SSE42: cost of 1 {{.*}} %V2I64 = xor + ; AVX: cost of 1 {{.*}} %V2I64 = xor + ; AVX2: cost of 1 {{.*}} %V2I64 = xor + ; AVX512: cost of 1 {{.*}} %V2I64 = xor + %V2I64 = xor <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = xor + ; SSE42: cost of 2 {{.*}} %V4I64 = xor + ; AVX: cost of 1 {{.*}} %V4I64 = xor + ; AVX2: cost of 1 {{.*}} %V4I64 = xor + ; AVX512: cost of 1 {{.*}} %V4I64 = xor + %V4I64 = xor <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = xor + ; SSE42: cost of 4 {{.*}} %V8I64 = xor + ; AVX: cost of 2 {{.*}} %V8I64 = xor + ; AVX2: cost of 2 {{.*}} %V8I64 = xor + ; AVX512: cost of 1 {{.*}} %V8I64 = xor + %V8I64 = xor <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = xor + %I32 = xor i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = xor + ; SSE42: cost of 1 {{.*}} %V4I32 = xor + ; AVX: cost of 1 
{{.*}} %V4I32 = xor + ; AVX2: cost of 1 {{.*}} %V4I32 = xor + ; AVX512: cost of 1 {{.*}} %V4I32 = xor + %V4I32 = xor <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = xor + ; SSE42: cost of 2 {{.*}} %V8I32 = xor + ; AVX: cost of 1 {{.*}} %V8I32 = xor + ; AVX2: cost of 1 {{.*}} %V8I32 = xor + ; AVX512: cost of 1 {{.*}} %V8I32 = xor + %V8I32 = xor <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = xor + ; SSE42: cost of 4 {{.*}} %V16I32 = xor + ; AVX: cost of 2 {{.*}} %V16I32 = xor + ; AVX2: cost of 2 {{.*}} %V16I32 = xor + ; AVX512: cost of 1 {{.*}} %V16I32 = xor + %V16I32 = xor <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = xor + %I16 = xor i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = xor + ; SSE42: cost of 1 {{.*}} %V8I16 = xor + ; AVX: cost of 1 {{.*}} %V8I16 = xor + ; AVX2: cost of 1 {{.*}} %V8I16 = xor + ; AVX512: cost of 1 {{.*}} %V8I16 = xor + %V8I16 = xor <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = xor + ; SSE42: cost of 2 {{.*}} %V16I16 = xor + ; AVX: cost of 1 {{.*}} %V16I16 = xor + ; AVX2: cost of 1 {{.*}} %V16I16 = xor + ; AVX512: cost of 1 {{.*}} %V16I16 = xor + %V16I16 = xor <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = xor + ; SSE42: cost of 4 {{.*}} %V32I16 = xor + ; AVX: cost of 2 {{.*}} %V32I16 = xor + ; AVX2: cost of 2 {{.*}} %V32I16 = xor + ; AVX512F: cost of 2 {{.*}} %V32I16 = xor + ; AVX512BW: cost of 1 {{.*}} %V32I16 = xor + %V32I16 = xor <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = xor + %I8 = xor i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = xor + ; SSE42: cost of 1 {{.*}} %V16I8 = xor + ; AVX: cost of 1 {{.*}} %V16I8 = xor + ; AVX2: cost of 1 {{.*}} %V16I8 = xor + ; AVX512: cost of 1 {{.*}} %V16I8 = xor + %V16I8 = xor <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = xor + ; SSE42: cost of 2 {{.*}} %V32I8 = xor + ; AVX: cost of 1 {{.*}} %V32I8 = xor + ; AVX2: cost of 1 {{.*}} %V32I8 = xor + ; AVX512: cost of 1 {{.*}} %V32I8 = xor + %V32I8 = xor <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = xor + ; SSE42: cost of 4 {{.*}} %V64I8 = xor + ; AVX: cost of 2 {{.*}} %V64I8 = xor + ; AVX2: cost of 2 {{.*}} %V64I8 = xor + ; AVX512F: cost of 2 {{.*}} %V64I8 = xor + ; AVX512BW: cost of 1 {{.*}} %V64I8 = xor + %V64I8 = xor <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'and' define i32 @and(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = and - ; SSE42: cost of 1 {{.*}} %A = and - ; AVX: cost of 1 {{.*}} %A = and - ; AVX2: cost of 1 {{.*}} %A = and - ; AVX512: cost of 1 {{.*}} %A = and - %A = and <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = and - ; SSE42: cost of 2 {{.*}} %B = and - ; AVX: cost of 1 {{.*}} %B = and - ; AVX2: cost of 1 {{.*}} %B = and - ; AVX512: cost of 1 {{.*}} %B = and - %B = and <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = and - ; SSE42: cost of 4 {{.*}} %C = and - ; AVX: cost of 2 {{.*}} %C = and - ; AVX2: cost of 2 {{.*}} %C = and - ; AVX512: cost of 1 {{.*}} %C = and - %C = and <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = and - ; SSE42: cost of 1 {{.*}} %D = and - ; AVX: cost of 1 {{.*}} %D = and - ; AVX2: cost of 1 {{.*}} %D = and - ; AVX512: cost of 1 {{.*}} %D = and - %D = and <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = and - ; SSE42: cost of 2 {{.*}} %E = and - ; AVX: cost of 1 {{.*}} %E = and - ; AVX2: cost of 1 {{.*}} %E = and - ; AVX512: cost of 1 {{.*}} %E = and - %E = and <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = and - ; SSE42: cost of 4 {{.*}} %F = and - ; AVX: 
cost of 2 {{.*}} %F = and - ; AVX2: cost of 2 {{.*}} %F = and - ; AVX512: cost of 1 {{.*}} %F = and - %F = and <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = and - ; SSE42: cost of 1 {{.*}} %G = and - ; AVX: cost of 1 {{.*}} %G = and - ; AVX2: cost of 1 {{.*}} %G = and - ; AVX512: cost of 1 {{.*}} %G = and - %G = and <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = and - ; SSE42: cost of 2 {{.*}} %H = and - ; AVX: cost of 1 {{.*}} %H = and - ; AVX2: cost of 1 {{.*}} %H = and - ; AVX512: cost of 1 {{.*}} %H = and - %H = and <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = and - ; SSE42: cost of 4 {{.*}} %I = and - ; AVX: cost of 2 {{.*}} %I = and - ; AVX2: cost of 2 {{.*}} %I = and - ; AVX512F: cost of 2 {{.*}} %I = and - ; AVX512BW: cost of 1 {{.*}} %I = and - %I = and <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = and - ; SSE42: cost of 1 {{.*}} %J = and - ; AVX: cost of 1 {{.*}} %J = and - ; AVX2: cost of 1 {{.*}} %J = and - ; AVX512: cost of 1 {{.*}} %J = and - %J = and <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = and - ; SSE42: cost of 2 {{.*}} %K = and - ; AVX: cost of 1 {{.*}} %K = and - ; AVX2: cost of 1 {{.*}} %K = and - ; AVX512: cost of 1 {{.*}} %K = and - %K = and <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = and - ; SSE42: cost of 4 {{.*}} %L = and - ; AVX: cost of 2 {{.*}} %L = and - ; AVX2: cost of 2 {{.*}} %L = and - ; AVX512F: cost of 2 {{.*}} %L = and - ; AVX512BW: cost of 1 {{.*}} %L = and - %L = and <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = and + %I64 = and i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = and + ; SSE42: cost of 1 {{.*}} %V2I64 = and + ; AVX: cost of 1 {{.*}} %V2I64 = and + ; AVX2: cost of 1 {{.*}} %V2I64 = and + ; AVX512: cost of 1 {{.*}} %V2I64 = and + %V2I64 = and <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = and + ; SSE42: cost of 2 {{.*}} %V4I64 = and + ; AVX: cost of 1 {{.*}} %V4I64 = and + ; AVX2: cost of 1 {{.*}} %V4I64 = and + ; AVX512: cost of 1 {{.*}} %V4I64 = and + %V4I64 = and <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = and + ; SSE42: cost of 4 {{.*}} %V8I64 = and + ; AVX: cost of 2 {{.*}} %V8I64 = and + ; AVX2: cost of 2 {{.*}} %V8I64 = and + ; AVX512: cost of 1 {{.*}} %V8I64 = and + %V8I64 = and <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = and + %I32 = and i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = and + ; SSE42: cost of 1 {{.*}} %V4I32 = and + ; AVX: cost of 1 {{.*}} %V4I32 = and + ; AVX2: cost of 1 {{.*}} %V4I32 = and + ; AVX512: cost of 1 {{.*}} %V4I32 = and + %V4I32 = and <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = and + ; SSE42: cost of 2 {{.*}} %V8I32 = and + ; AVX: cost of 1 {{.*}} %V8I32 = and + ; AVX2: cost of 1 {{.*}} %V8I32 = and + ; AVX512: cost of 1 {{.*}} %V8I32 = and + %V8I32 = and <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = and + ; SSE42: cost of 4 {{.*}} %V16I32 = and + ; AVX: cost of 2 {{.*}} %V16I32 = and + ; AVX2: cost of 2 {{.*}} %V16I32 = and + ; AVX512: cost of 1 {{.*}} %V16I32 = and + %V16I32 = and <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = and + %I16 = and i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = and + ; SSE42: cost of 1 {{.*}} %V8I16 = and + ; AVX: cost of 1 {{.*}} %V8I16 = and + ; AVX2: cost of 1 {{.*}} %V8I16 = and + ; AVX512: cost of 1 {{.*}} %V8I16 = and + %V8I16 = and <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = and + ; SSE42: cost of 2 {{.*}} %V16I16 = and + ; AVX: cost of 1 {{.*}} %V16I16 = and 
+ ; AVX2: cost of 1 {{.*}} %V16I16 = and + ; AVX512: cost of 1 {{.*}} %V16I16 = and + %V16I16 = and <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = and + ; SSE42: cost of 4 {{.*}} %V32I16 = and + ; AVX: cost of 2 {{.*}} %V32I16 = and + ; AVX2: cost of 2 {{.*}} %V32I16 = and + ; AVX512F: cost of 2 {{.*}} %V32I16 = and + ; AVX512BW: cost of 1 {{.*}} %V32I16 = and + %V32I16 = and <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = and + %I8 = and i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = and + ; SSE42: cost of 1 {{.*}} %V16I8 = and + ; AVX: cost of 1 {{.*}} %V16I8 = and + ; AVX2: cost of 1 {{.*}} %V16I8 = and + ; AVX512: cost of 1 {{.*}} %V16I8 = and + %V16I8 = and <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = and + ; SSE42: cost of 2 {{.*}} %V32I8 = and + ; AVX: cost of 1 {{.*}} %V32I8 = and + ; AVX2: cost of 1 {{.*}} %V32I8 = and + ; AVX512: cost of 1 {{.*}} %V32I8 = and + %V32I8 = and <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = and + ; SSE42: cost of 4 {{.*}} %V64I8 = and + ; AVX: cost of 2 {{.*}} %V64I8 = and + ; AVX2: cost of 2 {{.*}} %V64I8 = and + ; AVX512F: cost of 2 {{.*}} %V64I8 = and + ; AVX512BW: cost of 1 {{.*}} %V64I8 = and + %V64I8 = and <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'mul' define i32 @mul(i32 %arg) { - ; SSSE3: cost of 8 {{.*}} %A = mul - ; SSE42: cost of 8 {{.*}} %A = mul - ; AVX: cost of 8 {{.*}} %A = mul - ; AVX2: cost of 8 {{.*}} %A = mul - ; AVX512F: cost of 8 {{.*}} %A = mul - ; AVX512BW: cost of 8 {{.*}} %A = mul - ; AVX512DQ: cost of 1 {{.*}} %A = mul - %A = mul <2 x i64> undef, undef - ; SSSE3: cost of 16 {{.*}} %B = mul - ; SSE42: cost of 16 {{.*}} %B = mul - ; AVX: cost of 18 {{.*}} %B = mul - ; AVX2: cost of 8 {{.*}} %B = mul - ; AVX512F: cost of 8 {{.*}} %B = mul - ; AVX512BW: cost of 8 {{.*}} %B = mul - ; AVX512DQ: cost of 1 {{.*}} %B = mul - %B = mul <4 x i64> undef, undef - ; SSSE3: cost of 32 {{.*}} %C = mul - ; SSE42: cost of 32 {{.*}} %C = mul - ; AVX: cost of 36 {{.*}} %C = mul - ; AVX2: cost of 16 {{.*}} %C = mul - ; AVX512F: cost of 8 {{.*}} %C = mul - ; AVX512BW: cost of 8 {{.*}} %C = mul - ; AVX512DQ: cost of 1 {{.*}} %C = mul - %C = mul <8 x i64> undef, undef - - ; SSSE3: cost of 6 {{.*}} %D = mul - ; SSE42: cost of 1 {{.*}} %D = mul - ; AVX: cost of 1 {{.*}} %D = mul - ; AVX2: cost of 1 {{.*}} %D = mul - ; AVX512: cost of 1 {{.*}} %D = mul - %D = mul <4 x i32> undef, undef - ; SSSE3: cost of 12 {{.*}} %E = mul - ; SSE42: cost of 2 {{.*}} %E = mul - ; AVX: cost of 4 {{.*}} %E = mul - ; AVX2: cost of 1 {{.*}} %E = mul - ; AVX512: cost of 1 {{.*}} %E = mul - %E = mul <8 x i32> undef, undef - ; SSSE3: cost of 24 {{.*}} %F = mul - ; SSE42: cost of 4 {{.*}} %F = mul - ; AVX: cost of 8 {{.*}} %F = mul - ; AVX2: cost of 2 {{.*}} %F = mul - ; AVX512: cost of 1 {{.*}} %F = mul - %F = mul <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = mul - ; SSE42: cost of 1 {{.*}} %G = mul - ; AVX: cost of 1 {{.*}} %G = mul - ; AVX2: cost of 1 {{.*}} %G = mul - ; AVX512: cost of 1 {{.*}} %G = mul - %G = mul <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = mul - ; SSE42: cost of 2 {{.*}} %H = mul - ; AVX: cost of 4 {{.*}} %H = mul - ; AVX2: cost of 1 {{.*}} %H = mul - ; AVX512: cost of 1 {{.*}} %H = mul - %H = mul <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = mul - ; SSE42: cost of 4 {{.*}} %I = mul - ; AVX: cost of 8 {{.*}} %I = mul - ; AVX2: cost of 2 {{.*}} %I = mul - ; AVX512F: cost of 2 {{.*}} %I = mul - ; AVX512BW: cost of 1 {{.*}} %I = mul - %I = mul <32 x 
i16> undef, undef - - ; SSSE3: cost of 12 {{.*}} %J = mul - ; SSE42: cost of 12 {{.*}} %J = mul - ; AVX: cost of 12 {{.*}} %J = mul - ; AVX2: cost of 7 {{.*}} %J = mul - ; AVX512F: cost of 5 {{.*}} %J = mul - ; AVX512BW: cost of 4 {{.*}} %J = mul - %J = mul <16 x i8> undef, undef - ; SSSE3: cost of 24 {{.*}} %K = mul - ; SSE42: cost of 24 {{.*}} %K = mul - ; AVX: cost of 26 {{.*}} %K = mul - ; AVX2: cost of 17 {{.*}} %K = mul - ; AVX512F: cost of 13 {{.*}} %K = mul - ; AVX512BW: cost of 4 {{.*}} %K = mul - %K = mul <32 x i8> undef, undef - ; SSSE3: cost of 48 {{.*}} %L = mul - ; SSE42: cost of 48 {{.*}} %L = mul - ; AVX: cost of 52 {{.*}} %L = mul - ; AVX2: cost of 34 {{.*}} %L = mul - ; AVX512F: cost of 26 {{.*}} %L = mul - ; AVX512BW: cost of 11 {{.*}} %L = mul - %L = mul <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = mul + %I64 = mul i64 undef, undef + ; SSSE3: cost of 8 {{.*}} %V2I64 = mul + ; SSE42: cost of 8 {{.*}} %V2I64 = mul + ; AVX: cost of 8 {{.*}} %V2I64 = mul + ; AVX2: cost of 8 {{.*}} %V2I64 = mul + ; AVX512F: cost of 8 {{.*}} %V2I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V2I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = mul + %V2I64 = mul <2 x i64> undef, undef + ; SSSE3: cost of 16 {{.*}} %V4I64 = mul + ; SSE42: cost of 16 {{.*}} %V4I64 = mul + ; AVX: cost of 18 {{.*}} %V4I64 = mul + ; AVX2: cost of 8 {{.*}} %V4I64 = mul + ; AVX512F: cost of 8 {{.*}} %V4I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V4I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = mul + %V4I64 = mul <4 x i64> undef, undef + ; SSSE3: cost of 32 {{.*}} %V8I64 = mul + ; SSE42: cost of 32 {{.*}} %V8I64 = mul + ; AVX: cost of 36 {{.*}} %V8I64 = mul + ; AVX2: cost of 16 {{.*}} %V8I64 = mul + ; AVX512F: cost of 8 {{.*}} %V8I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V8I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = mul + %V8I64 = mul <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = mul + %I32 = mul i32 undef, undef + ; SSSE3: cost of 6 {{.*}} %V4I32 = mul + ; SSE42: cost of 1 {{.*}} %V4I32 = mul + ; AVX: cost of 1 {{.*}} %V4I32 = mul + ; AVX2: cost of 1 {{.*}} %V4I32 = mul + ; AVX512: cost of 1 {{.*}} %V4I32 = mul + %V4I32 = mul <4 x i32> undef, undef + ; SSSE3: cost of 12 {{.*}} %V8I32 = mul + ; SSE42: cost of 2 {{.*}} %V8I32 = mul + ; AVX: cost of 4 {{.*}} %V8I32 = mul + ; AVX2: cost of 1 {{.*}} %V8I32 = mul + ; AVX512: cost of 1 {{.*}} %V8I32 = mul + %V8I32 = mul <8 x i32> undef, undef + ; SSSE3: cost of 24 {{.*}} %V16I32 = mul + ; SSE42: cost of 4 {{.*}} %V16I32 = mul + ; AVX: cost of 8 {{.*}} %V16I32 = mul + ; AVX2: cost of 2 {{.*}} %V16I32 = mul + ; AVX512: cost of 1 {{.*}} %V16I32 = mul + %V16I32 = mul <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = mul + %I16 = mul i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = mul + ; SSE42: cost of 1 {{.*}} %V8I16 = mul + ; AVX: cost of 1 {{.*}} %V8I16 = mul + ; AVX2: cost of 1 {{.*}} %V8I16 = mul + ; AVX512: cost of 1 {{.*}} %V8I16 = mul + %V8I16 = mul <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = mul + ; SSE42: cost of 2 {{.*}} %V16I16 = mul + ; AVX: cost of 4 {{.*}} %V16I16 = mul + ; AVX2: cost of 1 {{.*}} %V16I16 = mul + ; AVX512: cost of 1 {{.*}} %V16I16 = mul + %V16I16 = mul <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = mul + ; SSE42: cost of 4 {{.*}} %V32I16 = mul + ; AVX: cost of 8 {{.*}} %V32I16 = mul + ; AVX2: cost of 2 {{.*}} %V32I16 = mul + ; AVX512F: cost of 2 {{.*}} %V32I16 = mul + ; AVX512BW: cost of 1 {{.*}} %V32I16 = mul + %V32I16 = mul <32 x i16> undef, undef + + ; CHECK: cost of 1 
{{.*}} %I8 = mul + %I8 = mul i8 undef, undef + ; SSSE3: cost of 12 {{.*}} %V16I8 = mul + ; SSE42: cost of 12 {{.*}} %V16I8 = mul + ; AVX: cost of 12 {{.*}} %V16I8 = mul + ; AVX2: cost of 7 {{.*}} %V16I8 = mul + ; AVX512F: cost of 5 {{.*}} %V16I8 = mul + ; AVX512BW: cost of 4 {{.*}} %V16I8 = mul + %V16I8 = mul <16 x i8> undef, undef + ; SSSE3: cost of 24 {{.*}} %V32I8 = mul + ; SSE42: cost of 24 {{.*}} %V32I8 = mul + ; AVX: cost of 26 {{.*}} %V32I8 = mul + ; AVX2: cost of 17 {{.*}} %V32I8 = mul + ; AVX512F: cost of 13 {{.*}} %V32I8 = mul + ; AVX512BW: cost of 4 {{.*}} %V32I8 = mul + %V32I8 = mul <32 x i8> undef, undef + ; SSSE3: cost of 48 {{.*}} %V64I8 = mul + ; SSE42: cost of 48 {{.*}} %V64I8 = mul + ; AVX: cost of 52 {{.*}} %V64I8 = mul + ; AVX2: cost of 34 {{.*}} %V64I8 = mul + ; AVX512F: cost of 26 {{.*}} %V64I8 = mul + ; AVX512BW: cost of 11 {{.*}} %V64I8 = mul + %V64I8 = mul <64 x i8> undef, undef ret i32 undef } Index: test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll =================================================================== --- test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1247,6 +1247,18 @@ ret float %res } +declare float @llvm.fma.f32(float, float, float) +define float @test_fma_intrin(float %a, float %b, float %c) { +; CHECK-LABEL: name: test_fma_intrin +; CHECK: [[A:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[B:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[C:%[0-9]+]](s32) = COPY %s2 +; CHECK: [[RES:%[0-9]+]](s32) = G_FMA [[A]], [[B]], [[C]] +; CHECK: %s0 = COPY [[RES]] + %res = call float @llvm.fma.f32(float %a, float %b, float %c) + ret float %res +} + declare void @llvm.lifetime.start.p0i8(i64, i8*) declare void @llvm.lifetime.end.p0i8(i64, i8*) define void @test_lifetime_intrin() { Index: test/CodeGen/AArch64/arm64-neon-copy.ll =================================================================== --- test/CodeGen/AArch64/arm64-neon-copy.ll +++ test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1378,7 +1378,7 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %vecext = extractelement <2 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 Index: test/CodeGen/AArch64/swiftself-scavenger.ll =================================================================== --- test/CodeGen/AArch64/swiftself-scavenger.ll +++ test/CodeGen/AArch64/swiftself-scavenger.ll @@ -5,7 +5,7 @@ ; CHECK: str [[REG:x[0-9]+]], [sp, #8] ; CHECK: add [[REG]], sp, #248 ; CHECK: str xzr, [{{\s*}}[[REG]], #32760] -; CHECK: ldr x30, [sp, #8] +; CHECK: ldr [[REG]], [sp, #8] target triple = "arm64-apple-ios" @ptr8 = external global i8* Index: test/CodeGen/AArch64/xray-attribute-instrumentation.ll =================================================================== --- test/CodeGen/AArch64/xray-attribute-instrumentation.ll +++ test/CodeGen/AArch64/xray-attribute-instrumentation.ll @@ -25,9 +25,9 @@ ; CHECK-NEXT: ret } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_0 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0 ; CHECK: .xword .Lxray_sled_0 ; CHECK: .xword .Lxray_sled_1 +; CHECK-LABEL: Lxray_sleds_end0 Index: test/CodeGen/AArch64/xray-tail-call-sled.ll 
=================================================================== --- test/CodeGen/AArch64/xray-tail-call-sled.ll +++ test/CodeGen/AArch64/xray-tail-call-sled.ll @@ -28,21 +28,20 @@ ; CHECK-NEXT: ret } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_0 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .xword .Lxray_sled_0 ; CHECK: .xword .Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section xray_fn_idx,{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .xword .Lxray_synthetic_0 -; CHECK-NEXT: .xword .Lxray_synthetic_end0 +; CHECK: .xword .Lxray_sleds_start0 +; CHECK-NEXT: .xword .Lxray_sleds_end0 define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" { ; CHECK: .p2align 2 -; CHECK-LABEL: .Lxray_sled_2: +; CHECK-LABEL: Lxray_sled_2: ; CHECK-NEXT: b #32 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -53,7 +52,7 @@ ; CHECK-NEXT: nop ; CHECK-LABEL: .Ltmp2: ; CHECK: .p2align 2 -; CHECK-LABEL: .Lxray_sled_3: +; CHECK-LABEL: Lxray_sled_3: ; CHECK-NEXT: b #32 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -68,14 +67,13 @@ ret i32 %retval } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_1 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_1 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .xword .Lxray_sled_2 ; CHECK: .xword .Lxray_sled_3 -; CHECK-LABEL: Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section xray_fn_idx,{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .xword .Lxray_synthetic_1 -; CHECK-NEXT: .xword .Lxray_synthetic_end1 +; CHECK: .xword .Lxray_sleds_start1 +; CHECK-NEXT: .xword .Lxray_sleds_end1 Index: test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -68,6 +68,10 @@ ret void } + define amdgpu_kernel void @undefined_vreg_operand() { + unreachable + } + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } @@ -856,3 +860,26 @@ S_ENDPGM ... +--- +# There is only an undef use operand for %1, so there is no +# corresponding defining instruction + +# GCN-LABEL: name: undefined_vreg_operand{{$}} +# GCN: bb.0 +# GCN-NEXT: FLAT_STORE_DWORD undef %3, undef %1, +# GCN-NEXT: S_ENDPGM +name: undefined_vreg_operand +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vgpr_32, preferred-register: '' } + - { id: 3, class: vreg_64, preferred-register: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %2 = V_XOR_B32_e64 killed %0, undef %1, implicit %exec + FLAT_STORE_DWORD undef %3, %2, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM + +... Index: test/CodeGen/AMDGPU/fold-operands-order.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fold-operands-order.mir @@ -0,0 +1,47 @@ +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @mov_in_use_list_2x() { + unreachable + } + +... +--- + +# Blocks should be processed in program order to make sure folds +# aren't made in users before the def is seen. 
+ +# GCN-LABEL: name: mov_in_use_list_2x{{$}} +# GCN: %2 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: %3 = COPY undef %0 + +# GCN: %1 = V_MOV_B32_e32 0, implicit %exec + + +name: mov_in_use_list_2x +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vgpr_32, preferred-register: '' } + - { id: 3, class: vgpr_32, preferred-register: '' } +liveins: +body: | + bb.0: + successors: %bb.2 + + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + %2 = COPY %1 + %3 = V_XOR_B32_e64 killed %2, undef %0, implicit %exec + + bb.2: + successors: %bb.1 + + %1 = V_MOV_B32_e32 0, implicit %exec + S_BRANCH %bb.1 + +... Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -6,9 +6,9 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { @@ -22,9 +22,9 @@ ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -39,9 +39,9 @@ ; into. 
; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NEXT: v_mul_lo_i32 v0, v0, 9 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -71,8 +71,8 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 v0, vcc_hi, 6 +; GCN-NEXT: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -99,8 +99,8 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN: v_lshr_b32_e64 v1, vcc_hi, 6 +; GCN: s_sub_u32 s6, s5, s4 +; GCN: v_lshr_b32_e64 v1, s6, 6 ; GCN: s_and_saveexec_b64 ; GCN: v_add_i32_e32 v0, vcc, 4, v1 @@ -123,10 +123,10 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-DAG: s_movk_i32 vcc_hi, 0x204 -; GCN: v_add_i32_e32 v0, vcc, vcc_hi, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-DAG: s_movk_i32 s6, 0x204 +; GCN: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]] ; GCN: v_mul_lo_i32 v0, v0, 9 ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { Index: test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -2,7 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, @@ -11,8 +11,38 @@ ret void } +;CHECK-LABEL: {{^}}test1_idx: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc +define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test1_scalar_offset: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc +define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test1_no_glc_slc: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 
offen offset:32 +define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0, + i32 0, i32 0) + ret void +} + ;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, @@ -22,7 +52,7 @@ } ;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, @@ -32,7 +62,7 @@ } ;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: -; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: -; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -0,0 +1,109 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, 
{{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) + %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x 
float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_idx: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_both: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -0,0 +1,110 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck 
-check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 +; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) + call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_idx: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_both: +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +; GCN-LABEL: {{^}}buffer_store_wait: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; GCN: s_waitcnt expcnt(0) +; GCN: 
buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; GCN: s_waitcnt vmcnt(0) +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + %data.i = bitcast <4 x float> %data to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + Index: test/CodeGen/AMDGPU/merge-store-crash.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-crash.ll +++ test/CodeGen/AMDGPU/merge-store-crash.ll @@ -26,11 +26,11 @@ %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1 %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2 %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1) ret void } ; Function Attrs: nounwind -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/merge-store-usedef.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-usedef.ll +++ test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -11,13 +11,13 @@ store i32 %v, i32 addrspace(3)* %p0 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, 
i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0) %w = load i32, i32 addrspace(3)* %p0 store i32 %w, i32 addrspace(3)* %p1 ret void } -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -62,7 +62,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -80,7 +81,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -175,6 +177,6 @@ } declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -0,0 +1,341 @@ +# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# Check that constant is in SGPR registers + +# GCN-LABEL: {{^}}name: const_to_sgpr{{$}} +# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0 +# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec + + +# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}} +# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0 +# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec + +# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}} +# GCN: %[[OP0:[0-9]+]] = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2 +# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec + +--- | + define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* 
nocapture %arg, i64 %id) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + + define amdgpu_kernel void @const_to_sgpr_multiple_use(i32 addrspace(1)* nocapture %arg, i64 %id1, i64 %id2) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + + define amdgpu_kernel void @const_to_sgpr_subreg(i32 addrspace(1)* nocapture %arg, i64 %id) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + +... +--- +name: const_to_sgpr +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vreg_64 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 0 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20 + %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
+--- +name: const_to_sgpr_multiple_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_64_xexec } + - { id: 10, class: sreg_32 } + - { id: 11, class: sreg_64 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_32_xm0 } + - { id: 18, class: sreg_64 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_32_xm0 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_64 } + - { id: 24, class: sreg_32_xm0 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_64 } + - { id: 27, class: sreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: sreg_64 } + - { id: 30, class: vreg_64 } + - { id: 31, class: sreg_64 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_64 } + - { id: 34, class: sreg_32_xm0 } + - { id: 35, class: sreg_32_xm0 } + - { id: 36, class: sgpr_64 } + - { id: 37, class: sgpr_128 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %9 = S_LOAD_DWORDX2_IMM %3, 13, 0 + %6 = COPY %7 + %10 = S_MOV_B32 0 + %11 = REG_SEQUENCE %2, 1, killed %10, 2 + %0 = COPY %11 + %12 = COPY %11.sub0 + %13 = COPY %11.sub1 + %14 = COPY %8.sub0 + %15 = COPY %8.sub1 + %16 = S_ADD_U32 %12, killed %14, implicit-def %scc + %17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc + %18 = REG_SEQUENCE killed %16, 1, killed %17, 2 + %19 = COPY %9.sub0 + %20 = COPY %9.sub1 + %21 = S_ADD_U32 %12, killed %19, implicit-def %scc + %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc + %23 = REG_SEQUENCE killed %21, 1, killed %22, 2 + %24 = S_MOV_B32 0 + %25 = S_MOV_B32 1048576 + %26 = REG_SEQUENCE killed %25, 1, killed %24, 2 + %28 = COPY %26 + %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec + %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec + %31 = S_AND_B64 killed %27, killed %29, implicit-def dead %scc + %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %32 = S_MOV_B32 2 + %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc + %34 = S_MOV_B32 61440 + %35 = S_MOV_B32 0 + %36 = REG_SEQUENCE killed %35, 1, killed %34, 2 + %37 = REG_SEQUENCE %6, 17, killed %36, 18 + %38 = V_MOV_B32_e32 0, implicit %exec + %39 = COPY %33 + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
+--- +name: const_to_sgpr_subreg +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 12 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20.sub1 + %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
Index: test/CodeGen/AMDGPU/scheduler-subrange-crash.ll =================================================================== --- test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,29 +25,29 @@ %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1) %array_vector21 = insertelement <4 x float> , float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1) ret void } declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -400,9 +400,9 @@ ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa Index: test/CodeGen/AMDGPU/shrink-carry.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shrink-carry.mir @@ -0,0 +1,101 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: subbrev{{$}} +# GCN: V_SUBBREV_U32_e64 0, undef %vgpr0, killed %vcc, implicit %exec + +--- +name: subbrev +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: subb{{$}} +# GCN: V_SUBB_U32_e64 undef %vgpr0, 0, killed %vcc, implicit %exec + +--- +name: subb +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBB_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc2{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc2 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... 
Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 @@ -258,9 +258,8 @@ ; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 ; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) +; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, +; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1) ; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -22,7 +22,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], @@ -57,7 +57,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -119,10 +119,10 @@ ; GCN: ; clobber m0 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s2, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz @@ -170,10 +170,10 @@ ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s0, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: ds_write_b64 Index: test/CodeGen/AMDGPU/uint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -19,7 +19,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]] @@ -50,7 +50,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: {{buffer|flat}}_store_dword 
{{.*}}[[VR]] Index: test/CodeGen/ARM/v6m-umul-with-overflow.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/v6m-umul-with-overflow.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s + +define i1 @unsigned_multiplication_did_overflow(i32, i32) { +; CHECK-LABEL: unsigned_multiplication_did_overflow: +entry-block: + %2 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1) + %3 = extractvalue { i32, i1 } %2, 1 + ret i1 %3 + +; CHECK: mov{{s?}} r2, r1 +; CHECK: mov{{s?}} r1, #0 +; CHECK: mov{{s?}} r3, {{#0|r1}} +; CHECK: bl __aeabi_lmul +} + +declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32) Index: test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll =================================================================== --- test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll +++ test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll @@ -24,14 +24,13 @@ ; CHECK-NEXT: bx lr } ; CHECK: .p2align 4 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_0 ; CHECK-NEXT: .long {{.*}}Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .long {{.*}}Lxray_sled_0 ; CHECK: .long {{.*}}Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .long {{.*}}Lxray_synthetic_0 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_end0 +; CHECK: .long {{.*}}Lxray_sleds_start0 +; CHECK-NEXT: .long {{.*}}Lxray_sleds_end0 Index: test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll =================================================================== --- test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll +++ test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll @@ -24,15 +24,14 @@ ; CHECK-NEXT: bx lr } ; CHECK: .p2align 4 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_0 ; CHECK-NEXT: .long {{.*}}Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .long {{.*}}Lxray_sled_0 ; CHECK: .long {{.*}}Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .long {{.*}}xray_synthetic_0 -; CHECK-NEXT: .long {{.*}}xray_synthetic_end0 +; CHECK: .long {{.*}}xray_sleds_start0 +; CHECK-NEXT: .long {{.*}}xray_sleds_end0 Index: test/CodeGen/PowerPC/2010-02-12-saveCR.ll =================================================================== --- test/CodeGen/PowerPC/2010-02-12-saveCR.ll +++ test/CodeGen/PowerPC/2010-02-12-saveCR.ll @@ -8,15 +8,15 @@ ; Note that part of what is being checked here is proper register reuse. 
; CHECK: mfcr [[T1:r[0-9]+]] ; cr2 ; CHECK: lis [[T2:r[0-9]+]], 1 -; CHECK: addi r3, r1, 72 ; CHECK: rotlwi [[T1]], [[T1]], 8 ; CHECK: ori [[T2]], [[T2]], 34540 ; CHECK: stwx [[T1]], r1, [[T2]] -; CHECK: lis [[T3:r[0-9]+]], 1 ; CHECK: mfcr [[T4:r[0-9]+]] ; cr3 -; CHECK: ori [[T3]], [[T3]], 34536 +; CHECK: lis [[T3:r[0-9]+]], 1 ; CHECK: rotlwi [[T4]], [[T4]], 12 +; CHECK: ori [[T3]], [[T3]], 34536 ; CHECK: stwx [[T4]], r1, [[T3]] +; CHECK: addi r3, r1, 72 %x = alloca [100000 x i8] ; <[100000 x i8]*> [#uses=1] %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] %x1 = bitcast [100000 x i8]* %x to i8* ; [#uses=1] Index: test/CodeGen/PowerPC/vsx-spill.ll =================================================================== --- test/CodeGen/PowerPC/vsx-spill.ll +++ test/CodeGen/PowerPC/vsx-spill.ll @@ -23,9 +23,9 @@ ; CHECK-REG: blr ; CHECK-FISL: @foo1 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65384 -; CHECK-FISL: stxsdx 1, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65384 +; CHECK-FISL: stxsdx 1, 1, 3 ; CHECK-FISL: blr ; CHECK-P9-REG: @foo1 @@ -54,8 +54,8 @@ ; CHECK-FISL: @foo2 ; CHECK-FISL: xsadddp [[R1:[0-9]+]], 1, 1 -; CHECK-FISL: stxsdx [[R1]], [[R1]], 0 -; CHECK-FISL: lxsdx [[R1]], [[R1]], 0 +; CHECK-FISL: stxsdx [[R1]], [[R1]], 3 +; CHECK-FISL: lxsdx [[R1]], [[R1]], 3 ; CHECK-FISL: blr ; CHECK-P9-REG: @foo2 Index: test/CodeGen/PowerPC/vsx.ll =================================================================== --- test/CodeGen/PowerPC/vsx.ll +++ test/CodeGen/PowerPC/vsx.ll @@ -235,9 +235,9 @@ ; CHECK-FISL-LABEL: @test14 ; CHECK-FISL: xxlor 0, 34, 35 ; CHECK-FISL: xxlnor 34, 34, 35 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 0, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 0, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test14 @@ -260,9 +260,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlnor 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test15 @@ -285,9 +285,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlnor 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test16 @@ -330,9 +330,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlandc 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test18 @@ -355,9 +355,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlandc 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test19 Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll =================================================================== --- test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -714,22 +714,13 @@ ; ; AVX1-LABEL: _clearupper8xi32b: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper8xi32b: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %x16 = bitcast <8 x i32> %0 to <16 x i16> %r0 = insertelement <16 x i16> %x16, i16 zeroinitializer, i32 1 Index: test/CodeGen/X86/insertelement-zero.ll =================================================================== --- test/CodeGen/X86/insertelement-zero.ll +++ test/CodeGen/X86/insertelement-zero.ll @@ -405,25 +405,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX: # BB#0: +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -12,19 +12,46 @@ define i32 @length2(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length2: -; X32: # BB#0: -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $2 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll memcmp -; X32-NEXT: addl $16, %esp +; X32: # BB#0: # %loadbb +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzwl (%ecx), %ecx +; X32-NEXT: movzwl (%eax), %eax +; X32-NEXT: rolw $8, %cx +; X32-NEXT: rolw $8, %ax +; X32-NEXT: movzwl %cx, %ecx +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: cmpl %eax, %ecx +; X32-NEXT: je .LBB0_1 +; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: movl $-1, %eax +; X32-NEXT: jb .LBB0_4 +; X32-NEXT: # BB#3: # %res_block +; X32-NEXT: movl $1, %eax 
+; X32-NEXT: .LBB0_4: # %endblock +; X32-NEXT: retl +; X32-NEXT: .LBB0_1: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length2: -; X64: # BB#0: -; X64-NEXT: movl $2, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind ret i32 %m } @@ -145,19 +172,42 @@ define i32 @length4(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length4: -; X32: # BB#0: -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $4 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll memcmp -; X32-NEXT: addl $16, %esp +; X32: # BB#0: # %loadbb +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: bswapl %ecx +; X32-NEXT: bswapl %eax +; X32-NEXT: cmpl %eax, %ecx +; X32-NEXT: je .LBB6_1 +; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: movl $-1, %eax +; X32-NEXT: jb .LBB6_4 +; X32-NEXT: # BB#3: # %res_block +; X32-NEXT: movl $1, %eax +; X32-NEXT: .LBB6_4: # %endblock +; X32-NEXT: retl +; X32-NEXT: .LBB6_1: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length4: -; X64: # BB#0: -; X64-NEXT: movl $4, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB6_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB6_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind ret i32 %m } @@ -259,9 +309,21 @@ ; X32-NEXT: retl ; ; X64-LABEL: length8: -; X64: # BB#0: -; X64-NEXT: movl $8, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB11_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB11_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind ret i32 %m } Index: test/CodeGen/X86/non-value-mem-operand.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/non-value-mem-operand.mir @@ -0,0 +1,293 @@ +# RUN: llc -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s + +# CHECK-NOT: FAULTING_OP + +--- | + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + @global = external global i8* + @global.1 = external global i8* + + declare i8* @ham(i8*, i8**) + + define void @eggs(i8* %arg) gc "statepoint-example" { + bb: + %tmp = call i8* undef(i8* undef, i8** undef) + %tmp1 = icmp eq i8* %tmp, null + br i1 %tmp1, label %bb2, label %bb3, !make.implicit !0 + + bb2: ; preds = %bb + br i1 undef, label %bb51, label 
%bb59 + + bb3: ; preds = %bb + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 16 + %tmp5 = bitcast i8* %tmp4 to i64* + br label %bb7 + + bb7: ; preds = %bb37, %bb3 + %tmp8 = phi i64* [ %tmp5, %bb3 ], [ %tmp18, %bb37 ] + %tmp10 = phi i32 [ undef, %bb3 ], [ %tmp48, %bb37 ] + %tmp12 = phi i32 [ 0, %bb3 ], [ 6, %bb37 ] + %tmp13 = phi double [ 0.000000e+00, %bb3 ], [ 2.000000e+00, %bb37 ] + %tmp14 = zext i32 %tmp10 to i64 + br i1 undef, label %bb26, label %bb15 + + bb15: ; preds = %bb7 + %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull @wibble, i32 0, i32 0, i32 0, i32 30, i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef) + br label %bb26 + + bb26: ; preds = %bb15, %bb7 + %tmp18 = phi i64* [ %tmp8, %bb7 ], [ undef, %bb15 ] + %tmp20 = sub i32 0, 0 + %tmp21 = select i1 undef, i32 0, i32 %tmp20 + %tmp22 = sext i32 %tmp21 to i64 + %tmp23 = load i8*, i8** @global.1, align 8 + %tmp24 = icmp eq i8* %tmp23, null + %tmp25 = select i1 %tmp24, i8* null, i8* undef + %tmp27 = load i32, i32* undef, align 4 + %sunkaddr = mul i64 %tmp14, 8 + %tmp2 = bitcast i64* %tmp18 to i8* + %sunkaddr1 = getelementptr i8, i8* %tmp2, i64 %sunkaddr + %tmp3 = bitcast i8* %sunkaddr1 to i64* + %tmp28 = load i64, i64* %tmp3, align 8 + %tmp29 = add i64 %tmp28, 1 + store i64 %tmp29, i64* %tmp3, align 8 + %tmp30 = trunc i64 %tmp28 to i32 + %tmp31 = sub i32 %tmp27, %tmp30 + store i32 %tmp31, i32* undef, align 4 + %tmp32 = getelementptr inbounds i8, i8* %tmp25, i64 768 + %tmp33 = bitcast i8* %tmp32 to i64* + %tmp34 = load i64, i64* %tmp33, align 8 + br i1 undef, label %bb37, label %bb35 + + bb35: ; preds = %bb26 + %tmp36 = call i8* @ham(i8* undef, i8** nonnull @global) + br label %bb37 + + bb37: ; preds = %bb35, %bb26 + %tmp38 = phi i8* [ %tmp36, %bb35 ], [ undef, %bb26 ] + %tmp39 = getelementptr inbounds i8, i8* %tmp38, i64 760 + %tmp40 = bitcast i8* %tmp39 to i64* + %tmp41 = load i64, i64* %tmp40, align 8 + %tmp42 = icmp slt i64 %tmp34, %tmp41 + %tmp43 = select i1 %tmp42, i64 %tmp41, i64 %tmp34 + %tmp44 = and i64 %tmp43, 63 + %tmp45 = ashr i64 %tmp29, %tmp44 + %sunkaddr2 = mul i64 %tmp14, 8 + %tmp6 = bitcast i64* %tmp18 to i8* + %sunkaddr3 = getelementptr i8, i8* %tmp6, i64 %sunkaddr2 + %tmp7 = bitcast i8* %sunkaddr3 to i64* + store i64 %tmp45, i64* %tmp7, align 8 + %tmp46 = sub i64 0, %tmp22 + store i64 %tmp46, i64* undef, align 8 + %tmp47 = add nsw i32 %tmp12, 1 + %tmp48 = add i32 %tmp10, 1 + %tmp49 = icmp sgt i32 %tmp48, 15140 + br i1 %tmp49, label %bb51.loopexit, label %bb7 + + bb51.loopexit: ; preds = %bb37 + %tmp9 = add i32 %tmp10, 1 + br label %bb51 + + bb51: ; preds = %bb51.loopexit, %bb2 + %tmp52 = phi i32 [ %tmp47, %bb51.loopexit ], [ 0, %bb2 ] + %tmp53 = phi double [ 2.000000e+00, %bb51.loopexit ], [ 0.000000e+00, %bb2 ] + %tmp54 = phi i32 [ %tmp9, %bb51.loopexit ], [ undef, %bb2 ] + %tmp56 = add i32 %tmp54, 0 + %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 -121, i32 0, i32 38, i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef) + unreachable + + bb59: ; preds = %bb2 + %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 8, i32 0, i32 38, i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null) + unreachable + } + + declare void @wibble() + + declare void @wobble(i32) + + declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) + + declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #0 + + attributes #0 = { nounwind } + + !0 = !{} +... +--- +name: eggs +alignment: 4 +tracksRegLiveness: true +fixedStack: + - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, callee-saved-register: '%rbx' } + - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, callee-saved-register: '%r12' } + - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '%r13' } + - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '%r14' } + - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%r15' } + - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%rbp' } +stack: + - { id: 0, offset: -88, size: 8, alignment: 8 } + - { id: 1, offset: -96, size: 8, alignment: 8 } + - { id: 2, offset: -104, size: 8, alignment: 8 } + - { id: 3, offset: -64, size: 8, alignment: 8 } + - { id: 4, type: spill-slot, offset: -72, size: 8, alignment: 8 } + - { id: 5, type: spill-slot, offset: -80, size: 8, alignment: 8 } +constants: + - id: 0 + value: 'double 2.000000e+00' + alignment: 8 +body: | + bb.0.bb: + successors: %bb.1.bb2(0x00000800), %bb.3.bb3(0x7ffff800) + liveins: %rbp, %r15, %r14, %r13, %r12, %rbx + + frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r15, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r14, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r13, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r12, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp + %rsp = frame-setup SUB64ri8 %rsp, 56, implicit-def dead %eflags + CALL64r undef %rax, csr_64, implicit %rsp, implicit undef %rdi, implicit undef %rsi, implicit-def %rsp, implicit-def %rax + TEST64rr %rax, %rax, implicit-def %eflags + JNE_1 %bb.3.bb3, implicit killed %eflags + + bb.1.bb2: + successors: %bb.2(0x40000000), %bb.13.bb59(0x40000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + TEST8rr %bpl, %bpl, 
implicit-def %eflags + JE_1 %bb.13.bb59, implicit killed %eflags + + bb.2: + successors: %bb.12.bb51(0x80000000) + liveins: %ebp + + %xmm0 = XORPSrr undef %xmm0, undef %xmm0 + %ebx = IMPLICIT_DEF implicit-def %rbx + JMP_1 %bb.12.bb51 + + bb.3.bb3: + successors: %bb.4.bb7(0x80000000) + liveins: %rax + + MOV64mr %rsp, 1, _, 32, _, %rax :: (store 8 into %stack.5) + %r12 = MOV64rr killed %rax + %r12 = ADD64ri8 killed %r12, 16, implicit-def dead %eflags + %xmm0 = XORPSrr undef %xmm0, undef %xmm0 + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + %rax = MOV64ri %const.0 + %xmm1 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool) + MOVSDmr %rsp, 1, _, 40, _, killed %xmm1 :: (store 8 into %stack.4) + %eax = IMPLICIT_DEF + %ecx = XOR32rr undef %ecx, undef %ecx, implicit-def dead %eflags + + bb.4.bb7: + successors: %bb.6.bb26(0x40000000), %bb.5.bb15(0x40000000) + liveins: %eax, %ecx, %esi, %r12, %xmm0 + + %ebp = MOV32rr killed %ecx + %ebx = MOV32rr killed %eax, implicit-def %rbx + %r14d = MOV32rr %ebx, implicit-def %r14 + TEST8rr %sil, %sil, implicit-def %eflags + JNE_1 %bb.6.bb26, implicit %eflags + + bb.5.bb15: + successors: %bb.6.bb26(0x80000000) + liveins: %ebp, %rbx, %r14, %xmm0 + + MOV32mr %rsp, 1, _, 24, _, %ebx :: (store 4 into %stack.0, align 8) + MOV32mr %rsp, 1, _, 16, _, %ebp :: (store 4 into %stack.1, align 8) + MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2) + %rax = MOV64rm %rsp, 1, _, 32, _ :: (load 8 from %stack.5) + MOV64mr %rsp, 1, _, 48, _, killed %rax :: (store 8 into %stack.3) + %rax = MOV64ri @wibble + STATEPOINT 2882400000, 0, 0, killed %rax, 2, 0, 2, 0, 2, 30, 2, 1, 2, 0, 2, 99, 2, 0, 2, 12, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 10, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 6, 2, 4278124286, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 1, 8, %rsp, 48, 2, 7, 2, 4278124286, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2), (volatile load 8 from %stack.3) + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + %r12 = IMPLICIT_DEF + + bb.6.bb26: + successors: %bb.8.bb37(0x40000000), %bb.7.bb35(0x40000000) + liveins: %ebp, %esi, %rbx, %r12, %r14 + + %rax = MOV64ri @global.1 + %rax = MOV64rm killed %rax, 1, _, 0, _ :: (dereferenceable load 8 from @global.1) + TEST64rr %rax, %rax, implicit-def %eflags + %rax = CMOVE64rr undef %rax, killed %rax, implicit killed %eflags + %ecx = MOV32rm undef %rax, 1, _, 0, _ :: (load 4 from `i32* undef`) + %rdx = MOV64rm %r12, 8, %r14, 0, _ :: (load 8 from %ir.tmp3) + %r15 = LEA64r %rdx, 1, _, 1, _ + MOV64mr %r12, 8, %r14, 0, _, %r15 :: (store 8 into %ir.tmp3) + %ecx = SUB32rr killed %ecx, %edx, implicit-def dead %eflags, implicit killed %rdx + MOV32mr undef %rax, 1, _, 0, _, killed %ecx :: (store 4 into `i32* undef`) + %r13 = MOV64rm killed %rax, 1, _, 768, _ :: (load 8 from %ir.tmp33) + TEST8rr %sil, %sil, implicit-def %eflags + %rax = IMPLICIT_DEF + JNE_1 %bb.8.bb37, implicit %eflags + + bb.7.bb35: + successors: %bb.8.bb37(0x80000000) + liveins: %ebp, %rbx, %r12, %r13, %r14, %r15 + + %rsi = MOV64ri @global + %rax = MOV64ri @ham + CALL64r killed %rax, csr_64, implicit %rsp, implicit undef %rdi, implicit %rsi, implicit-def %rsp, implicit-def %rax + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + + bb.8.bb37: + successors: %bb.9.bb37(0x40000000), %bb.10.bb37(0x40000000) + liveins: %ebp, %esi, %rax, %rbx, %r12, %r13, %r14, %r15 + + %rcx = MOV64rm 
killed %rax, 1, _, 760, _ :: (load 8 from %ir.tmp40) + CMP64rr %r13, %rcx, implicit-def %eflags + JL_1 %bb.10.bb37, implicit %eflags + + bb.9.bb37: + successors: %bb.10.bb37(0x80000000) + liveins: %ebp, %esi, %rbx, %r12, %r13, %r14, %r15 + + %cl = MOV8rr %r13b, implicit killed %r13, implicit-def %rcx + + bb.10.bb37: + successors: %bb.11.bb51.loopexit(0x00000800), %bb.4.bb7(0x7ffff800) + liveins: %ebp, %esi, %rbx, %rcx, %r12, %r14, %r15 + + %cl = KILL %cl, implicit killed %rcx + %r15 = SAR64rCL killed %r15, implicit-def dead %eflags, implicit %cl + MOV64mr %r12, 8, killed %r14, 0, _, killed %r15 :: (store 8 into %ir.tmp7) + MOV64mi32 undef %rax, 1, _, 0, _, 0 :: (store 8 into `i64* undef`) + %eax = LEA64_32r %rbx, 1, _, 1, _ + %ecx = MOV32ri 6 + CMP32ri %eax, 15141, implicit-def %eflags + %xmm0 = MOVSDrm %rsp, 1, _, 40, _ :: (load 8 from %stack.4) + JL_1 %bb.4.bb7, implicit %eflags + + bb.11.bb51.loopexit: + successors: %bb.12.bb51(0x80000000) + liveins: %ebp, %rbx + + %ebp = INC32r killed %ebp, implicit-def dead %eflags + %ebx = INC32r %ebx, implicit-def dead %eflags, implicit killed %rbx, implicit-def %rbx + %rax = MOV64ri %const.0 + %xmm0 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool) + + bb.12.bb51: + liveins: %ebp, %rbx, %xmm0 + + MOV32mr %rsp, 1, _, 24, _, %ebx, implicit killed %rbx :: (store 4 into %stack.0, align 8) + MOV32mr %rsp, 1, _, 16, _, killed %ebp :: (store 4 into %stack.1, align 8) + MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2) + %rax = MOV64ri @wobble + %edi = MOV32ri -121 + STATEPOINT 2882400000, 0, 1, killed %rax, %edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 270, 2, 4, 2, 12, 2, 0, 2, 11, 2, 4278124286, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 6, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 99, 2, 0, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2) + + bb.13.bb59: + %rax = MOV64ri @wobble + %edi = MOV32ri 8 + STATEPOINT 2882400000, 0, 1, killed %rax, %edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 123, 2, 4, 2, 12, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 13, 2, 0, 2, 10, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, csr_64, implicit-def %rsp + +... 
Index: test/CodeGen/X86/vector-shuffle-v48.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v48.ll +++ test/CodeGen/X86/vector-shuffle-v48.ll @@ -3,42 +3,18 @@ define <32 x i8> @foo(<48 x i8>* %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: foo: ; CHECK: # BB#0: -; CHECK-NEXT: vmovdqu 32(%rdi), %xmm0 -; CHECK-NEXT: vmovdqu (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpextrb $0, %xmm2, %eax -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero -; CHECK-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $2, %xmm2, %eax -; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $3, %xmm2, %eax -; CHECK-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $5, %xmm2, %eax -; CHECK-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $6, %xmm2, %eax -; CHECK-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $1, %xmm0, %eax -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $2, %xmm0, %eax -; CHECK-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $4, %xmm0, %eax -; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $5, %xmm0, %eax -; CHECK-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $7, %xmm0, %eax -; CHECK-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $8, %xmm0, %eax -; CHECK-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $10, %xmm0, %eax -; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $11, %xmm0, %eax -; CHECK-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $13, %xmm0, %eax -; CHECK-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $14, %xmm0, %eax -; CHECK-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu 32(%rdi), %xmm1 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero +; CHECK-NEXT: vpor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0] +; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %x0, align 1 %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> Index: test/CodeGen/X86/xray-attribute-instrumentation.ll =================================================================== --- test/CodeGen/X86/xray-attribute-instrumentation.ll +++ test/CodeGen/X86/xray-attribute-instrumentation.ll @@ -14,17 +14,16 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_0 ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_sled_0 ; CHECK: .quad {{.*}}xray_sled_1 -; 
CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .quad {{.*}}xray_synthetic_0 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end0 +; CHECK: .quad {{.*}}xray_sleds_start0 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end0 ; We test multiple returns in a single function to make sure we're getting all @@ -52,15 +51,14 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_1 ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1 ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{.*}}xray_sled_2 ; CHECK: .quad {{.*}}xray_sled_3 ; CHECK: .quad {{.*}}xray_sled_4 -; CHECK-LABEL: Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .quad {{.*}}xray_synthetic_1 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end1 +; CHECK: .quad {{.*}}xray_sleds_start1 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end1 Index: test/CodeGen/X86/xray-custom-log.ll =================================================================== --- test/CodeGen/X86/xray-custom-log.ll +++ test/CodeGen/X86/xray-custom-log.ll @@ -17,7 +17,7 @@ ret i32 0 } ; CHECK: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_event_sled_0 declare void @llvm.xray.customevent(i8*, i32) Index: test/CodeGen/X86/xray-log-args.ll =================================================================== --- test/CodeGen/X86/xray-log-args.ll +++ test/CodeGen/X86/xray-log-args.ll @@ -6,7 +6,7 @@ define i32 @callee(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" { ret i32 %arg } -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{\.?}}Lxray_sled_0 ; CHECK: .quad {{_?}}callee ; CHECK: .byte 3 @@ -22,7 +22,7 @@ %retval = tail call i32 @callee(i32 %arg) ret i32 %retval } -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{\.?}}Lxray_sled_2 ; CHECK: .quad {{_?}}caller ; CHECK: .byte 3 Index: test/CodeGen/X86/xray-tail-call-sled.ll =================================================================== --- test/CodeGen/X86/xray-tail-call-sled.ll +++ test/CodeGen/X86/xray-tail-call-sled.ll @@ -14,17 +14,16 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_0{{.*}} ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0{{.*}} ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_sled_0 ; CHECK: .quad {{.*}}xray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK-NEXT: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .quad {{.*}}xray_synthetic_0 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end0 +; CHECK: .quad {{.*}}xray_sleds_start0 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end0 define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" { ; CHECK: .p2align 1, 0x90 @@ -42,13 +41,12 @@ ret i32 %retval } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_1{{.*}} ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1{{.*}} -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{.*}}xray_sled_2 ; CHECK: .quad {{.*}}xray_sled_3 -; CHECK-LABEL: 
Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .quad {{.*}}xray_synthetic_1 -; CHECK: .quad {{.*}}xray_synthetic_end1 +; CHECK: .quad {{.*}}xray_sleds_start1 +; CHECK: .quad {{.*}}xray_sleds_end1 Index: test/DebugInfo/COFF/globals.ll =================================================================== --- test/DebugInfo/COFF/globals.ll +++ test/DebugInfo/COFF/globals.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s | FileCheck %s --check-prefix=ASM ; RUN: llc < %s -filetype=obj | llvm-readobj - -codeview | FileCheck %s --check-prefix=OBJ +; RUN: llc < %s -filetype=obj | obj2yaml | FileCheck %s --check-prefix=YAML ; C++ source to regenerate: ; $ cat t.cpp @@ -109,6 +110,43 @@ ; OBJ: ] ; OBJ: ] +; YAML-LABEL: - Name: '.debug$S' +; YAML: Subsections: +; YAML: - !Symbols +; YAML: Records: +; YAML: - Kind: S_COMPILE3 +; YAML: Compile3Sym: +; YAML: - !Symbols +; YAML: Records: +; YAML: - Kind: S_LDATA32 +; YAML: DataSym: +; YAML-NOT: Segment +; YAML: Type: 116 +; YAML-NOT: Segment +; YAML: DisplayName: first +; YAML-NOT: Segment +; YAML: - Kind: S_GTHREAD32 +; YAML: ThreadLocalDataSym: +; YAML: Type: 4097 +; YAML: DisplayName: middle +; YAML: - Kind: S_GDATA32 +; YAML: DataSym: +; YAML-NOT: Segment +; YAML: Type: 116 +; YAML-NOT: Offset +; YAML-NOT: Segment +; YAML: DisplayName: last +; YAML-NOT: Segment + +; The missing offsets are represented as relocations against this section. +; YAML: Relocations: +; YAML: - VirtualAddress: 92 +; YAML: SymbolName: '?first@@3HA' +; YAML: Type: IMAGE_REL_AMD64_SECREL +; YAML: - VirtualAddress: 96 +; YAML: SymbolName: '?first@@3HA' +; YAML: Type: IMAGE_REL_AMD64_SECTION + ; ModuleID = 't.cpp' source_filename = "t.cpp" target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" Index: test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir =================================================================== --- /dev/null +++ test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir @@ -0,0 +1,249 @@ +# RUN: llc -start-after=livedebugvalues -filetype=obj -o - %s \ +# RUN: | llvm-dwarfdump - | FileCheck %s + +# This tests for a crash in DwarfDebug's singular DBG_VALUE range promotion when +# encountering an IMPLICIT_DEF in its own lexical scope. 
+ +# CHECK: .debug_info contents: +# CHECK: DW_TAG_formal_parameter +# CHECK: DW_AT_const_value [DW_FORM_udata] (0) +--- | + ; ModuleID = 't.ll' + source_filename = "t.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--linux-gnu" + + %class.v = type <{ i32, i8, i8, [2 x i8] }> + %class.j = type <{ %"class.j<6, a::f>::D", i32, [4 x i8] }> + %"class.j<6, a::f>::D" = type { %"class.j<6, a::f>::p" } + %"class.j<6, a::f>::p" = type { i64 } + + @bt = global i32 0, align 4 + + define void @_ZN1v2bvEv(%class.v* nocapture readonly %this) local_unnamed_addr align 2 !dbg !14 { + entry: + %bz = alloca %class.j, align 8 + %att = alloca %class.j, align 8 + %ap = getelementptr inbounds %class.v, %class.v* %this, i64 0, i32 1 + %0 = load i8, i8* %ap, align 4 + %conv = sext i8 %0 to i32 + switch i32 %conv, label %sw.epilog [ + i32 1, label %_ZN1jILi6EN1a1fEE1mEj.exit + i32 0, label %sw.bb2 + ] + + _ZN1jILi6EN1a1fEE1mEj.exit: ; preds = %entry + %1 = bitcast %class.j* %att to i64* + %2 = bitcast %class.j* %bz to i64* + store i64 1, i64* %2, align 8 + call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !18, metadata !23), !dbg !24 + store i64 1, i64* %1, align 8, !dbg !27 + br label %sw.epilog + + sw.bb2: ; preds = %entry + %3 = bitcast %class.j* %att to i64* + %4 = bitcast %class.j* %bz to i64* + %.pre = load i64, i64* %3, align 8 + %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i13.phi.trans.insert = getelementptr inbounds %class.j, %class.j* %bz, i64 0, i32 1 + %.phi.trans.insert = bitcast i32* %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i13.phi.trans.insert to i64* + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14.pre = load i64, i64* %.phi.trans.insert, align 8 + %.pre25 = load i64, i64* %4, align 8 + %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i.phi.trans.insert = getelementptr inbounds %class.j, %class.j* %att, i64 0, i32 1 + %.phi.trans.insert26 = bitcast i32* %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i.phi.trans.insert to i64* + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i.pre = load i64, i64* %.phi.trans.insert26, align 8 + br label %sw.epilog + + sw.epilog: ; preds = %sw.bb2, %_ZN1jILi6EN1a1fEE1mEj.exit, %entry + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i = phi i64 [ %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i.pre, %sw.bb2 ], [ undef, %entry ], [ undef, %_ZN1jILi6EN1a1fEE1mEj.exit ], !dbg !32 + %5 = phi i64 [ %.pre25, %sw.bb2 ], [ 0, %entry ], [ 1, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14 = phi i64 [ %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14.pre, %sw.bb2 ], [ undef, %entry ], [ undef, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %6 = phi i64 [ %.pre, %sw.bb2 ], [ 0, %entry ], [ 1, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %bw1 = bitcast %class.v* %this to i32* + %7 = load i32, i32* %bw1, align 4 + %bx = getelementptr inbounds %class.v, %class.v* %this, i64 0, i32 2 + %8 = load i8, i8* %bx, align 1 + %tobool = icmp ne i8 %8, 0 + %.fca.0.insert9 = insertvalue [2 x i64] undef, i64 %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14, 0 + %.fca.1.insert12 = insertvalue [2 x i64] %.fca.0.insert9, i64 %5, 1 + %.fca.0.insert = insertvalue [2 x i64] undef, i64 %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i, 0 + %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %6, 1 + call void @_Z2byi1LS_bbPi(i32 %7, [2 x i64] %.fca.1.insert12, [2 x i64] %.fca.1.insert, i1 %tobool, i1 false, i32* nonnull @bt) + ret void + } + + declare void @_Z2byi1LS_bbPi(i32, [2 x i64], [2 x i64], i1, i1, i32*) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable + declare void 
@llvm.dbg.value(metadata, i64, metadata, metadata) #0 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { nounwind readnone speculatable } + attributes #1 = { nounwind } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!12, !13} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 305696) (llvm/trunk 305708)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2) + !1 = !DIFile(filename: "/", directory: "/") + !2 = !{} + !3 = !{!4, !10} + !4 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "r", file: !5, line: 50, size: 8, elements: !6, identifier: "_ZTS1r") + !5 = !DIFile(filename: "current.ii", directory: "/") + !6 = !{!7} + !7 = !DISubprogram(name: "r", scope: !4, file: !5, line: 52, type: !8, isLocal: false, isDefinition: false, scopeLine: 52, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !8 = !DISubroutineType(types: !9) + !9 = !{null} + !10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "q", scope: !11, file: !5, line: 39, size: 64, elements: !2, identifier: "_ZTSN1jILi6EN1a1fEE1qE") + !11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "j<6, a::f>", file: !5, line: 7, size: 128, elements: !2, templateParams: !2, identifier: "_ZTS1jILi6EN1a1fEE") + !12 = !{i32 2, !"Debug Info Version", i32 3} + !13 = !{i32 1, !"wchar_size", i32 4} + !14 = distinct !DISubprogram(name: "bv", linkageName: "_ZN1v2bvEv", scope: !15, file: !5, line: 104, type: !16, isLocal: false, isDefinition: true, scopeLine: 104, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !17, variables: !2) + !15 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "v", file: !5, line: 97, size: 64, elements: !2, identifier: "_ZTS1v") + !16 = !DISubroutineType(types: !2) + !17 = !DISubprogram(name: "bv", linkageName: "_ZN1v2bvEv", scope: !15, file: !5, line: 98, type: !16, isLocal: false, isDefinition: false, scopeLine: 98, flags: DIFlagPrototyped, isOptimized: true) + !18 = !DILocalVariable(arg: 2, scope: !19, file: !5, line: 22, type: !21) + !19 = distinct !DISubprogram(name: "m", linkageName: "_ZN1jILi6EN1a1fEE1mEj", scope: !11, file: !5, line: 22, type: !16, isLocal: false, isDefinition: true, scopeLine: 22, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !20, variables: !2) + !20 = !DISubprogram(name: "m", linkageName: "_ZN1jILi6EN1a1fEE1mEj", scope: !11, file: !5, line: 22, type: !16, isLocal: false, isDefinition: false, scopeLine: 22, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !21 = !DIDerivedType(tag: DW_TAG_typedef, name: "h", file: !5, line: 10, baseType: !22) + !22 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + !23 = !DIExpression() + !24 = !DILocation(line: 22, scope: !19, inlinedAt: !25) + !25 = distinct !DILocation(line: 109, scope: !26) + !26 = distinct !DILexicalBlock(scope: !14, file: !5, line: 106) + !27 = !DILocation(line: 29, scope: !28, inlinedAt: !31) + !28 = distinct !DISubprogram(name: "n", linkageName: "_ZN1jILi6EN1a1fEE1p1nEl", scope: !29, file: !5, line: 29, type: !8, isLocal: false, isDefinition: true, scopeLine: 29, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !30, variables: !2) + !29 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "p", scope: !11, file: !5, line: 26, size: 64, elements: !2, identifier: "_ZTSN1jILi6EN1a1fEE1pE") + !30 = !DISubprogram(name: "n", linkageName: 
"_ZN1jILi6EN1a1fEE1p1nEl", scope: !29, file: !5, line: 29, type: !8, isLocal: false, isDefinition: false, scopeLine: 29, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !31 = distinct !DILocation(line: 24, scope: !19, inlinedAt: !25) + !32 = !DILocation(line: 61, scope: !33, inlinedAt: !38) + !33 = distinct !DISubprogram(name: "bc >", linkageName: "_ZN1s2bcI1jILi6EN1a1fEEEEDTcl2badeclsr1aE2aaIPT_EEEES6_", scope: !34, file: !5, line: 60, type: !16, isLocal: false, isDefinition: true, scopeLine: 60, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !35, variables: !2) + !34 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", file: !5, line: 59, size: 8, elements: !2, identifier: "_ZTS1s") + !35 = !DISubprogram(name: "bc >", linkageName: "_ZN1s2bcI1jILi6EN1a1fEEEEDTcl2badeclsr1aE2aaIPT_EEEES6_", scope: !34, file: !5, line: 60, type: !16, isLocal: false, isDefinition: false, scopeLine: 60, flags: DIFlagPrototyped, isOptimized: true, templateParams: !36) + !36 = !{!37} + !37 = !DITemplateTypeParameter(name: "ay", type: !11) + !38 = distinct !DILocation(line: 70, scope: !39, inlinedAt: !42) + !39 = distinct !DISubprogram(name: "bc", linkageName: "_ZN1JI1s1jILi6EN1a1fEEE2bcEPS4_", scope: !40, file: !5, line: 70, type: !16, isLocal: false, isDefinition: true, scopeLine: 70, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !41, variables: !2) + !40 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "J >", file: !5, line: 69, size: 8, elements: !2, templateParams: !2, identifier: "_ZTS1JI1s1jILi6EN1a1fEEE") + !41 = !DISubprogram(name: "bc", linkageName: "_ZN1JI1s1jILi6EN1a1fEEE2bcEPS4_", scope: !40, file: !5, line: 70, type: !16, isLocal: false, isDefinition: false, scopeLine: 70, flags: DIFlagPrototyped, isOptimized: true) + !42 = distinct !DILocation(line: 85, scope: !43, inlinedAt: !46) + !43 = distinct !DISubprogram(name: "u >", linkageName: "_ZN1uC2I1jILi6EN1a1fEEEERT_", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !45, variables: !2) + !44 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "u", file: !5, line: 82, size: 128, elements: !2, identifier: "_ZTS1u") + !45 = !DISubprogram(name: "u >", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: false, scopeLine: 85, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true, templateParams: !36) + !46 = distinct !DILocation(line: 85, scope: !47, inlinedAt: !48) + !47 = distinct !DISubprogram(name: "u >", linkageName: "_ZN1uC1I1jILi6EN1a1fEEEERT_", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !45, variables: !2) + !48 = distinct !DILocation(line: 92, scope: !49, inlinedAt: !52) + !49 = distinct !DISubprogram(name: "L >", linkageName: "_ZN1LC2I1jILi6EN1a1fEEEERT_", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: true, scopeLine: 92, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !2, declaration: !51, variables: !2) + !50 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "L", file: !5, line: 88, size: 128, elements: !2, identifier: "_ZTS1L") + !51 = !DISubprogram(name: "L >", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: false, scopeLine: 92, flags: DIFlagPublic | 
DIFlagPrototyped, isOptimized: true, templateParams: !2) + !52 = distinct !DILocation(line: 92, scope: !53, inlinedAt: !54) + !53 = distinct !DISubprogram(name: "L >", linkageName: "_ZN1LC1I1jILi6EN1a1fEEEERT_", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: true, scopeLine: 92, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !2, declaration: !51, variables: !2) + !54 = distinct !DILocation(line: 114, scope: !14) + +... +--- +name: _ZN1v2bvEv +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%x0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 48 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: + - { id: 0, name: bz, type: default, offset: -32, size: 16, alignment: 8, + callee-saved-register: '', local-offset: -16, di-variable: '', di-expression: '', + di-location: '' } + - { id: 1, name: att, type: default, offset: -48, size: 16, alignment: 8, + callee-saved-register: '', local-offset: -32, di-variable: '', di-expression: '', + di-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, + callee-saved-register: '%lr', di-variable: '', di-expression: '', + di-location: '' } +constants: +body: | + bb.0.entry: + successors: %bb.3.sw.bb2(0x2aaaaaab), %bb.1.entry(0x55555555) + liveins: %x0, %lr + + %sp = frame-setup SUBXri %sp, 48, 0 + frame-setup STRXui killed %lr, %sp, 4 :: (store 8 into %stack.2) + frame-setup CFI_INSTRUCTION def_cfa_offset 48 + frame-setup CFI_INSTRUCTION offset %w30, -16 + %w8 = LDRSBWui %x0, 4 :: (load 1 from %ir.ap, align 4) + CBZW %w8, %bb.3.sw.bb2 + + bb.1.entry: + successors: %bb.2._ZN1jILi6EN1a1fEE1mEj.exit(0x40000001), %bb.4(0x3fffffff) + liveins: %w8, %x0 + + dead %wzr = SUBSWri killed %w8, 1, 0, implicit-def %nzcv + Bcc 1, %bb.4, implicit %nzcv + + bb.2._ZN1jILi6EN1a1fEE1mEj.exit: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %w2 = ORRWri %wzr, 0, implicit-def %x2 + %x3 = IMPLICIT_DEF debug-location !32 + %x1 = IMPLICIT_DEF + STRXui %x2, %sp, 2 :: (store 8 into %ir.2) + DBG_VALUE 0, 0, !18, !23, debug-location !24 + STRXui %x2, %sp, 0, debug-location !27 :: (store 8 into %ir.1) + %w4 = ORRWri %wzr, 0, implicit-def %x4 + B %bb.5.sw.epilog + + bb.3.sw.bb2: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %x4, %x3 = LDPXi %sp, 0 :: (dereferenceable load 8 from %ir.3), (dereferenceable load 8 from %ir..phi.trans.insert26) + %x2, %x1 = LDPXi %sp, 2 :: (dereferenceable load 8 from %ir..phi.trans.insert), (dereferenceable load 8 from %ir.4) + B %bb.5.sw.epilog + + bb.4: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %x2 = ORRXrs %xzr, %xzr, 0 + %x4 = ORRXrs %xzr, %xzr, 0 + %x3 = IMPLICIT_DEF debug-location !32 + %x1 = IMPLICIT_DEF + + bb.5.sw.epilog: + liveins: %x0, %x1, %x2, %x3, %x4 + + %w8 = LDRBBui %x0, 5 :: (load 1 from %ir.bx) + %w0 = LDRWui killed %x0, 0 :: (load 4 from %ir.bw1) + %x7 = ADRP target-flags(aarch64-page) @bt + %x7 = ADDXri killed %x7, target-flags(aarch64-pageoff, aarch64-nc) @bt, 0 + dead %wzr = SUBSWri killed %w8, 0, 0, implicit-def %nzcv + %w5 = CSINCWr %wzr, %wzr, 0, implicit killed %nzcv + %w6 = ORRWrs %wzr, 
%wzr, 0 + BL @_Z2byi1LS_bbPi, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit killed %w0, implicit killed %x1, implicit killed %x2, implicit killed %x3, implicit killed %x4, implicit killed %w5, implicit killed %w6, implicit killed %x7, implicit-def %sp + %lr = LDRXui %sp, 4 :: (load 8 from %stack.2) + %sp = ADDXri %sp, 48, 0 + RET undef %lr + +... Index: test/DebugInfo/PDB/pdb-yaml-symbols.test =================================================================== --- test/DebugInfo/PDB/pdb-yaml-symbols.test +++ test/DebugInfo/PDB/pdb-yaml-symbols.test @@ -55,6 +55,7 @@ YAML: DbgStart: 3 YAML: DbgEnd: 8 YAML: FunctionType: 4097 +YAML: Offset: 16 YAML: Segment: 1 YAML: Flags: [ HasFP ] YAML: DisplayName: main @@ -178,4 +179,4 @@ YAML: Length: 8 YAML: Characteristics: 1107296320 YAML: Name: .reloc -YAML: ... \ No newline at end of file +YAML: ... Index: test/MC/AMDGPU/flat-global.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/flat-global.s @@ -0,0 +1,87 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding 2>&1 %s | FileCheck -check-prefix=GFX9-ERR -check-prefix=GCNERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding 2>&1 %s | FileCheck -check-prefix=VI-ERR -check-prefix=GCNERR %s + +global_load_ubyte v1, v[3:4] +// GFX9: global_load_ubyte v1, v[3:4] ; encoding: [0x00,0x80,0x40,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_sbyte v1, v[3:4] +// GFX9: global_load_sbyte v1, v[3:4] ; encoding: [0x00,0x80,0x44,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_ushort v1, v[3:4] +// GFX9: global_load_ushort v1, v[3:4] ; encoding: [0x00,0x80,0x48,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_sshort v1, v[3:4] +// GFX9: global_load_sshort v1, v[3:4] ; encoding: [0x00,0x80,0x4c,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dword v1, v[3:4] +// GFX9: global_load_dword v1, v[3:4] ; encoding: [0x00,0x80,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx2 v[1:2], v[3:4] +// GFX9: global_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x80,0x54,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx3 v[1:3], v[3:4] +// GFX9: global_load_dwordx3 v[1:3], v[3:4] ; encoding: [0x00,0x80,0x58,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx4 v[1:4], v[3:4] +// GFX9: global_load_dwordx4 v[1:4], v[3:4] ; encoding: [0x00,0x80,0x5c,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU +// FIXME: VI error should be instruction not supported +global_load_dword v1, v[3:4] offset:0 +// GFX9: global_load_dword v1, v[3:4] ; encoding: [0x00,0x80,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:4095 +// GFX9: global_load_dword v1, v[3:4] offset:4095 ; encoding: [0xff,0x8f,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:-1 +// GFX9: global_load_dword v1, v[3:4] offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand.
+ +global_load_dword v1, v[3:4] offset:-4096 +// GFX9: global_load_dword v1, v[3:4] offset:-4096 ; encoding: [0x00,0x90,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:4096 +// GFX9-ERR: :30: error: invalid operand for instruction +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:-4097 +// GFX9-ERR: :30: error: invalid operand for instruction +// VI-ERR: :36: error: not a valid operand. + +global_store_byte v[3:4], v1 +// GFX9: global_store_byte v[3:4], v1 ; encoding: [0x00,0x80,0x60,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_short v[3:4], v1 +// GFX9: global_store_short v[3:4], v1 ; encoding: [0x00,0x80,0x68,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dword v[3:4], v1 +// GFX9: global_store_dword v[3:4], v1 ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx2 v[3:4], v[1:2] +// GFX9: global_store_dwordx2 v[3:4], v[1:2] ; encoding: [0x00,0x80,0x74,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx3 v[3:4], v[1:3] +// GFX9: global_store_dwordx3 v[3:4], v[1:3] ; encoding: [0x00,0x80,0x78,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx4 v[3:4], v[1:4] +// GFX9: global_store_dwordx4 v[3:4], v[1:4] ; encoding: [0x00,0x80,0x7c,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dword v[3:4], v1 offset:12 +// GFX9: global_store_dword v[3:4], v1 offset:12 ; encoding: [0x0c,0x80,0x70,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: :37: error: not a valid operand Index: test/MC/AMDGPU/mtbuf.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/mtbuf.s @@ -0,0 +1,36 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICI %s +// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICI %s +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +//===----------------------------------------------------------------------===// +// Test for dfmt and nfmt (tbuffer only) +//===----------------------------------------------------------------------===// + +tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: 
tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] + Index: test/MC/ARM/elf-movt.s =================================================================== --- test/MC/ARM/elf-movt.s +++ test/MC/ARM/elf-movt.s @@ -1,6 +1,6 @@ @ RUN: llvm-mc %s -triple=armv7-linux-gnueabi | FileCheck -check-prefix=ASM %s -@ RUN: llvm-mc %s -triple=armv7-linux-gnueabi -filetype=obj -o - | \ -@ RUN: llvm-readobj -s -sd -sr | FileCheck -check-prefix=OBJ %s +@ RUN: llvm-mc %s -triple=armv7-linux-gnueabi -filetype=obj -o %t.o +@ RUN: llvm-objdump -d -r %t.o -triple=armv7-linux-gnueabi | FileCheck -check-prefix=OBJ %s .syntax unified .text .globl barf @@ -14,41 +14,9 @@ @ ASM: movw r0, :lower16:(GOT-(.LPC0_2+8)) @ ASM-NEXT: movt r0, :upper16:(GOT-(.LPC0_2+8)) -@@ make sure that the text section fixups are sane too -@ OBJ: Section { -@ OBJ: Name: .text -@ OBJ-NEXT: Type: SHT_PROGBITS -@ OBJ-NEXT: Flags [ (0x6) -@ OBJ-NEXT: SHF_ALLOC -@ OBJ-NEXT: SHF_EXECINSTR -@ OBJ-NEXT: ] -@ OBJ-NEXT: Address: 0x0 -@ OBJ-NEXT: Offset: 0x34 -@ OBJ-NEXT: Size: 8 -@ OBJ-NEXT: Link: 0 -@ OBJ-NEXT: Info: 0 -@ OBJ-NEXT: AddressAlignment: 4 -@ OBJ-NEXT: EntrySize: 0 -@ OBJ-NEXT: Relocations [ -@ OBJ-NEXT: ] -@ OBJ-NEXT: SectionData ( -@ OBJ-NEXT: 0000: F00F0FE3 F40F4FE3 -@ OBJ-NEXT: ) -@ OBJ-NEXT: } -@ OBJ: Section { -@ OBJ: Index: -@ OBJ: Name: .rel.text -@ OBJ-NEXT: Type: SHT_REL (0x9) -@ OBJ-NEXT: Flags [ (0x0) -@ OBJ-NEXT: ] -@ OBJ-NEXT: Address: 0x0 -@ OBJ-NEXT: Offset: -@ OBJ-NEXT: Size: 16 -@ OBJ-NEXT: Link: -@ OBJ-NEXT: Info: -@ OBJ-NEXT: AddressAlignment: 4 -@ OBJ-NEXT: EntrySize: 8 -@ OBJ-NEXT: Relocations [ -@ OBJ-NEXT: 0x0 R_ARM_MOVW_PREL_NC GOT 0x0 -@ OBJ-NEXT: 0x4 R_ARM_MOVT_PREL GOT 0x0 -@ OBJ-NEXT: ] +@OBJ: Disassembly of section .text: +@OBJ-NEXT: barf: +@OBJ-NEXT: 0: f0 0f 0f e3 movw r0, #65520 +@OBJ-NEXT: 00000000: R_ARM_MOVW_PREL_NC GOT +@OBJ-NEXT: 4: f4 0f 4f e3 movt r0, #65524 +@OBJ-NEXT: 00000004: R_ARM_MOVT_PREL GOT Index: test/MC/Disassembler/AMDGPU/mtbuf_vi.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AMDGPU/mtbuf_vi.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI + +# VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +0x00 0x00 0x78 0xe9 0x00 
0x01 0x01 0x01 + +# VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x78 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x79 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +0x00 0x00 0x7a 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x7a 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x7b 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +0x00 0x80 0x7b 0xe9 0x00 0x01 0x1d 0x71 Index: test/MC/ELF/bad-expr2.s =================================================================== --- test/MC/ELF/bad-expr2.s +++ test/MC/ELF/bad-expr2.s @@ -1,11 +1,10 @@ // RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o /dev/null \ // RUN: 2>&1 | FileCheck %s -// CHECK: No relocation available to represent this relative expression -// CHECK: call foo - bar - - +// CHECK: [[@LINE+2]]:{{[0-9]+}}: error: No relocation available to represent this relative expression +// CHECK-NEXT: call foo - bar call foo - bar + .section .foo foo: .section .bar Index: test/Transforms/CodeGenPrepare/X86/memcmp.ll =================================================================== --- test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -6,9 +6,47 @@ declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp2( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp2( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; X32-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X32: res_block: +; X32-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X32-NEXT: br label %endblock +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp2( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; 
X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 +; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X64: res_block: +; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) ret i32 %call @@ -24,9 +62,45 @@ } define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp4( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp4( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0 +; X32-NEXT: br i1 [[TMP7]], label %res_block, label %endblock +; X32: res_block: +; X32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; X32-NEXT: br label %endblock +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp4( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 +; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X64: res_block: +; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) ret i32 %call @@ -60,9 +134,28 @@ } define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp8( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp8( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp8( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; X64-NEXT: br i1 [[TMP7]], label %res_block, label %endblock 
+; X64: res_block: +; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) ret i32 %call @@ -142,8 +235,13 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -168,8 +266,13 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -219,11 +322,22 @@ } define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq8( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq8( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) %cmp = icmp eq i32 %call, 0 Index: test/Transforms/IndVarSimplify/huge_muls.ll =================================================================== --- /dev/null +++ test/Transforms/IndVarSimplify/huge_muls.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -indvars -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This test takes excessively long time if SCEV tries to construct huge +; SCEVMulExpr's (with ~1000 ops) due to non-linear analysis cost. 
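+; (For illustration only: each multiply in the body below feeds the next one,
+; so a fully expanded SCEV for the value returned via %tmp17 can accumulate on
+; the order of a thousand multiply operands, which is the cost noted above.)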
+define i32 @test() { +; CHECK-LABEL: @test( +bci_0: + br label %bci_12 + +bci_133: ; preds = %bci_127.unr-lcssa + ret i32 %tmp17 + +bci_12: ; preds = %bci_127.unr-lcssa, %bci_0 + %indvars.iv184 = phi i64 [ %indvars.iv.next185, %bci_127.unr-lcssa ], [ 3, %bci_0 ] + %tmp1 = trunc i64 %indvars.iv184 to i32 + br label %bci_55.postloop + +bci_127.unr-lcssa: ; preds = %bci_90.postloop + %indvars.iv.next185 = add nuw nsw i64 %indvars.iv184, 1 + %tmp4 = icmp sgt i64 %indvars.iv184, 91 + br i1 %tmp4, label %bci_133, label %bci_12 + +bci_55.postloop: ; preds = %bci_90.postloop, %bci_12 + %indvars.iv180.postloop = phi i64 [ %indvars.iv.next181.postloop, %bci_90.postloop ], [ 15, %bci_12 ] + %local_2_16.postloop = phi i32 [ %tmp17, %bci_90.postloop ], [ 4, %bci_12 ] + %indvars.iv.next181.postloop = add nuw nsw i64 %indvars.iv180.postloop, 1 + %tmp6 = load i32, i32 addrspace(1)* undef, align 4 + %tmp7 = mul i32 %tmp6, %tmp1 + br label %not_zero65.us.postloop + +not_zero65.us.postloop: ; preds = %not_zero65.us.postloop.1, %bci_55.postloop + %local_2_24.us.postloop = phi i32 [ %local_2_16.postloop, %bci_55.postloop ], [ %tmp49, %not_zero65.us.postloop.1 ] + %local_6_.us.postloop = phi i32 [ 3, %bci_55.postloop ], [ %tmp50, %not_zero65.us.postloop.1 ] + %tmp8 = mul i32 %tmp7, %local_2_24.us.postloop + %tmp9 = mul i32 %tmp8, %local_2_24.us.postloop + %tmp10 = mul i32 %tmp7, %tmp9 + %tmp11 = mul i32 %tmp10, %tmp9 + %tmp12 = mul i32 %tmp7, %tmp11 + %tmp13 = mul i32 %tmp12, %tmp11 + %tmp14 = mul i32 %tmp7, %tmp13 + %tmp15 = mul i32 %tmp14, %tmp13 + %tmp16 = mul i32 %tmp7, %tmp15 + %tmp17 = mul i32 %tmp16, %tmp15 + %tmp18 = icmp sgt i32 %local_6_.us.postloop, 82 + br i1 %tmp18, label %bci_90.postloop, label %not_zero65.us.postloop.1 + +bci_90.postloop: ; preds = %not_zero65.us.postloop + %tmp19 = icmp sgt i64 %indvars.iv180.postloop, 68 + br i1 %tmp19, label %bci_127.unr-lcssa, label %bci_55.postloop + +not_zero65.us.postloop.1: ; preds = %not_zero65.us.postloop + %tmp20 = mul i32 %tmp7, %tmp17 + %tmp21 = mul i32 %tmp20, %tmp17 + %tmp22 = mul i32 %tmp7, %tmp21 + %tmp23 = mul i32 %tmp22, %tmp21 + %tmp24 = mul i32 %tmp7, %tmp23 + %tmp25 = mul i32 %tmp24, %tmp23 + %tmp26 = mul i32 %tmp7, %tmp25 + %tmp27 = mul i32 %tmp26, %tmp25 + %tmp28 = mul i32 %tmp7, %tmp27 + %tmp29 = mul i32 %tmp28, %tmp27 + %tmp30 = mul i32 %tmp7, %tmp29 + %tmp31 = mul i32 %tmp30, %tmp29 + %tmp32 = mul i32 %tmp7, %tmp31 + %tmp33 = mul i32 %tmp32, %tmp31 + %tmp34 = mul i32 %tmp7, %tmp33 + %tmp35 = mul i32 %tmp34, %tmp33 + %tmp36 = mul i32 %tmp7, %tmp35 + %tmp37 = mul i32 %tmp36, %tmp35 + %tmp38 = mul i32 %tmp7, %tmp37 + %tmp39 = mul i32 %tmp38, %tmp37 + %tmp40 = mul i32 %tmp7, %tmp39 + %tmp41 = mul i32 %tmp40, %tmp39 + %tmp42 = mul i32 %tmp7, %tmp41 + %tmp43 = mul i32 %tmp42, %tmp41 + %tmp44 = mul i32 %tmp7, %tmp43 + %tmp45 = mul i32 %tmp44, %tmp43 + %tmp46 = mul i32 %tmp7, %tmp45 + %tmp47 = mul i32 %tmp46, %tmp45 + %tmp48 = mul i32 %tmp7, %tmp47 + %tmp49 = mul i32 %tmp48, %tmp47 + %tmp50 = add nsw i32 %local_6_.us.postloop, 20 + br label %not_zero65.us.postloop +} Index: test/Transforms/LoopVectorize/AMDGPU/packed-math.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -0,0 +1,34 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI 
-check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s + +; GCN-LABEL: @vectorize_v2f16_loop( +; GFX9: vector.body: +; GFX9: phi <2 x half> +; GFX9: load <2 x half> +; GFX9: fadd fast <2 x half> + +; GFX9: middle.block: +; GFX9: fadd fast <2 x half> + +; VI: phi half +; VI: phi load half +; VI: fadd fast half +define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv + %0 = load half, half addrspace(1)* %arrayidx, align 2 + %add = fadd fast half %q.04, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %add.lcssa = phi half [ %add, %for.body ] + ret half %add.lcssa +} Index: test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -0,0 +1,195 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s + +; FIXME: Should still like to vectorize the memory operations for VI + +; Simple 3-pair chain with loads and stores +; GCN-LABEL: @test1_as_3_3_3_v2f16( +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_as_3_0_0( +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: load <2 x half>, <2 x half>* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half>* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %b, i64 1 + %i4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half* %c, i64 1 + store half %mul5, half* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_as_0_0_3_v2f16( 
+; GFX9: load <2 x half>, <2 x half>* +; GFX9: load <2 x half>, <2 x half>* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) { + %i0 = load half, half* %a, align 2 + %i1 = load half, half* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half* %a, i64 1 + %i3 = load half, half* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %b, i64 1 + %i4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_fma_v2f16( +; GFX9: load <2 x half> +; GFX9: load <2 x half> +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +; GCN-LABEL: @mul_scalar_v2f16( +; GFX9: load <2 x half> +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> +define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %mul = fmul half %i0, %scalar + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %mul5 = fmul half %i3, %scalar + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @fabs_v2f16 +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fabs.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %fabs0 = call half @llvm.fabs.f16(half %i0) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %fabs1 = call half @llvm.fabs.f16(half %i3) + store half %fabs0, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %fabs1, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_fabs_fma_v2f16( +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fabs.v2f16( +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define 
amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %i0.fabs = call half @llvm.fabs.f16(half %i0) + + %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %i3.fabs = call half @llvm.fabs.f16(half %i3) + + %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +; FIXME: Should do vector load and extract component for fabs +; GCN-LABEL: @test1_fabs_scalar_fma_v2f16( +; GFX9: load half +; GFX9: call half @llvm.fabs.f16( +; GFX9: load <2 x half> +; GFX9: load half +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %i1.fabs = call half @llvm.fabs.f16(half %i1) + + %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare half @llvm.fma.f16(half, half, half) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll =================================================================== --- test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s -; XFAIL: * -; -; FIXME: If this test expects to be vectorized, the TTI must indicate that the target -; has vector registers of the expected width. -; Currently, it says there are 8 vector registers that are 32-bits wide. 
- -target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" - - -; Simple 3-pair chain with loads and stores -define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) { -; CHECK-LABEL: @test1_as_3_3_3( -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* % -; CHECK: ret - %i0 = load double, double addrspace(3)* %a, align 8 - %i1 = load double, double addrspace(3)* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1 - %i3 = load double, double addrspace(3)* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double addrspace(3)* %b, i64 1 - %i4 = load double, double addrspace(3)* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double addrspace(3)* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1 - store double %mul5, double addrspace(3)* %arrayidx5, align 8 - ret void -} - -define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) { -; CHECK-LABEL: @test1_as_3_0_0( -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: load <2 x double>, <2 x double>* -; CHECK: store <2 x double> %{{.*}}, <2 x double>* % -; CHECK: ret - %i0 = load double, double addrspace(3)* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1 - %i3 = load double, double addrspace(3)* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -} - -define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) { -; CHECK-LABEL: @test1_as_0_0_3( -; CHECK: load <2 x double>, <2 x double>* -; CHECK: load <2 x double>, <2 x double>* -; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* % -; CHECK: ret - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double addrspace(3)* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1 - store double %mul5, double addrspace(3)* %arrayidx5, align 8 - ret void -} Index: test/tools/llvm-objdump/X86/macho-info-plist.test =================================================================== --- test/tools/llvm-objdump/X86/macho-info-plist.test +++ test/tools/llvm-objdump/X86/macho-info-plist.test @@ -1,7 +1,11 @@ # RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -info-plist - | FileCheck %s +# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -info-plist -no-leading-headers - | FileCheck 
--check-prefix=NOHEADER %s .section __TEXT, __info_plist .asciz "\n" # CHECK: Contents of (__TEXT,__info_plist) section # CHECK: + +# NOHEADER-NOT: Contents of (__TEXT,__info_plist) section +# NOHEADER: Index: test/tools/llvm-objdump/X86/macho-objc-meta-data.test =================================================================== --- test/tools/llvm-objdump/X86/macho-objc-meta-data.test +++ test/tools/llvm-objdump/X86/macho-objc-meta-data.test @@ -1042,7 +1042,7 @@ OBJC2_64BIT_DYLIB: Contents of (__DATA_CONST,__objc_classlist) section OBJC2_64BIT_DYLIB: 000000000000c038 0x8030 _OBJC_CLASS_$_Test OBJC2_64BIT_DYLIB: isa 0x8008 _OBJC_METACLASS_$_Test -OBJC2_64BIT_DYLIB: superclass 0x0 +OBJC2_64BIT_DYLIB: superclass 0x0 _OBJC_CLASS_$_NSObject OBJC2_64BIT_DYLIB: cache 0x0 OBJC2_64BIT_DYLIB: vtable 0x0 OBJC2_64BIT_DYLIB: data 0xc120 (struct class_ro_t *) @@ -1081,7 +1081,7 @@ OBJC2_64BIT_DYLIB: attributes 0x4f4b TQ,V_testProp OBJC2_64BIT_DYLIB: Meta Class OBJC2_64BIT_DYLIB: isa 0x0 -OBJC2_64BIT_DYLIB: superclass 0x0 +OBJC2_64BIT_DYLIB: superclass 0x0 _OBJC_METACLASS_$_NSObject OBJC2_64BIT_DYLIB: cache 0x0 OBJC2_64BIT_DYLIB: vtable 0x0 OBJC2_64BIT_DYLIB: data 0xc048 (struct class_ro_t *) Index: tools/llvm-cvtres/llvm-cvtres.cpp =================================================================== --- tools/llvm-cvtres/llvm-cvtres.cpp +++ tools/llvm-cvtres/llvm-cvtres.cpp @@ -37,7 +37,7 @@ enum ID { OPT_INVALID = 0, // This is not an option ID. #define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ + HELPTEXT, METAVAR, VALUES) \ OPT_##ID, #include "Opts.inc" #undef OPTION @@ -49,12 +49,12 @@ static const opt::OptTable::Info InfoTable[] = { #define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ + HELPTEXT, METAVAR, VALUES) \ { \ - PREFIX, NAME, HELPTEXT, \ - METAVAR, OPT_##ID, opt::Option::KIND##Class, \ - PARAM, FLAGS, OPT_##GROUP, \ - OPT_##ALIAS, ALIASARGS}, + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, #include "Opts.inc" #undef OPTION }; Index: tools/llvm-objdump/MachODump.cpp =================================================================== --- tools/llvm-objdump/MachODump.cpp +++ tools/llvm-objdump/MachODump.cpp @@ -1135,7 +1135,8 @@ DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); if (SegName == "__TEXT" && SectName == "__info_plist") { - outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; + if (!NoLeadingHeaders) + outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr; Section.getContents(BytesStr); const char *sect = reinterpret_cast(BytesStr.data()); @@ -4572,6 +4573,12 @@ n_value, c.superclass); if (name != nullptr) outs() << " " << name; + else { + name = get_dyld_bind_info_symbolname(S.getAddress() + + offset + offsetof(struct class64_t, superclass), info); + if (name != nullptr) + outs() << " " << name; + } outs() << "\n"; outs() << " cache " << format("0x%" PRIx64, c.cache); Index: tools/llvm-pdbutil/Diff.cpp =================================================================== --- tools/llvm-pdbutil/Diff.cpp +++ tools/llvm-pdbutil/Diff.cpp @@ -198,17 +198,6 @@ File2.getBlockCount()); Diffs |= diffAndPrint("Unknown 1", File1, File2, File1.getUnknown1(), File2.getUnknown1()); - - if (opts::diff::Pedantic) { - Diffs |= diffAndPrint("Free Block Map", File1, File2, - 
File1.getFreeBlockMapBlock(), - File2.getFreeBlockMapBlock()); - Diffs |= diffAndPrint("Directory Size", File1, File2, - File1.getNumDirectoryBytes(), - File2.getNumDirectoryBytes()); - Diffs |= diffAndPrint("Block Map Addr", File1, File2, - File1.getBlockMapOffset(), File2.getBlockMapOffset()); - } if (!Diffs) outs() << "MSF Super Block: No differences detected...\n"; return Error::success(); @@ -222,114 +211,72 @@ outs() << "Stream Directory: Searching for differences...\n"; bool HasDifferences = false; - if (opts::diff::Pedantic) { - size_t Min = std::min(P.size(), Q.size()); - for (size_t I = 0; I < Min; ++I) { - StringRef Names[] = {P[I], Q[I]}; - uint32_t Sizes[] = {File1.getStreamByteSize(I), - File2.getStreamByteSize(I)}; - bool NamesDiffer = Names[0] != Names[1]; - bool SizesDiffer = Sizes[0] != Sizes[1]; - if (NamesDiffer) { - HasDifferences = true; - outs().indent(2) << formatv("Stream {0} - {1}: {2}, {3}: {4}\n", I, - File1.getFilePath(), Names[0], - File2.getFilePath(), Names[1]); - continue; - } - if (SizesDiffer) { - HasDifferences = true; - outs().indent(2) << formatv( - "Stream {0} ({1}): {2}: {3} bytes, {4}: {5} bytes\n", I, Names[0], - File1.getFilePath(), Sizes[0], File2.getFilePath(), Sizes[1]); - continue; - } - } + auto PI = to_vector<32>(enumerate(P)); + auto QI = to_vector<32>(enumerate(Q)); - ArrayRef MaxNames = (P.size() > Q.size() ? P : Q); - size_t Max = std::max(P.size(), Q.size()); - PDBFile &MaxFile = (P.size() > Q.size() ? File1 : File2); - StringRef MinFileName = - (P.size() < Q.size() ? File1.getFilePath() : File2.getFilePath()); - for (size_t I = Min; I < Max; ++I) { - HasDifferences = true; - StringRef StreamName = MaxNames[I]; - - outs().indent(2) << formatv( - "Stream {0} - {1}: , {2}: Index {3}, {4} bytes\n", - StreamName, MinFileName, MaxFile.getFilePath(), I, - MaxFile.getStreamByteSize(I)); - } - if (!HasDifferences) - outs() << "Stream Directory: No differences detected...\n"; - } else { - auto PI = to_vector<32>(enumerate(P)); - auto QI = to_vector<32>(enumerate(Q)); - - typedef decltype(PI) ContainerType; - typedef typename ContainerType::value_type value_type; - - auto Comparator = [](const value_type &I1, const value_type &I2) { - return I1.value() < I2.value(); - }; - - decltype(PI) OnlyP; - decltype(QI) OnlyQ; - decltype(PI) Common; - - set_differences(PI, QI, &OnlyP, &OnlyQ, &Common, Comparator); - - if (!OnlyP.empty()) { - HasDifferences = true; - outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyP.size(), - File1.getFilePath()); - for (auto &Item : OnlyP) { - outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), - Item.value()); - } + typedef decltype(PI) ContainerType; + typedef typename ContainerType::value_type value_type; + + auto Comparator = [](const value_type &I1, const value_type &I2) { + return I1.value() < I2.value(); + }; + + decltype(PI) OnlyP; + decltype(QI) OnlyQ; + decltype(PI) Common; + + set_differences(PI, QI, &OnlyP, &OnlyQ, &Common, Comparator); + + if (!OnlyP.empty()) { + HasDifferences = true; + outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyP.size(), + File1.getFilePath()); + for (auto &Item : OnlyP) { + outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), + Item.value()); } + } - if (!OnlyQ.empty()) { - HasDifferences = true; - outs().indent(2) << formatv("{0} Streams(s) only in ({1})\n", - OnlyQ.size(), File2.getFilePath()); - for (auto &Item : OnlyQ) { - outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), - Item.value()); - } + if 
(!OnlyQ.empty()) { + HasDifferences = true; + outs().indent(2) << formatv("{0} Streams(s) only in ({1})\n", OnlyQ.size(), + File2.getFilePath()); + for (auto &Item : OnlyQ) { + outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), + Item.value()); } - if (!Common.empty()) { - outs().indent(2) << formatv("Found {0} common streams. Searching for " - "intra-stream differences.\n", - Common.size()); - bool HasCommonDifferences = false; - for (const auto &Left : Common) { - // Left was copied from the first range so its index refers to a stream - // index in the first file. Find the corresponding stream index in the - // second file. - auto Range = - std::equal_range(QI.begin(), QI.end(), Left, - [](const value_type &L, const value_type &R) { - return L.value() < R.value(); - }); - const auto &Right = *Range.first; - assert(Left.value() == Right.value()); - uint32_t LeftSize = File1.getStreamByteSize(Left.index()); - uint32_t RightSize = File2.getStreamByteSize(Right.index()); - if (LeftSize != RightSize) { - HasDifferences = true; - HasCommonDifferences = true; - outs().indent(4) << formatv("{0} ({1}: {2} bytes, {3}: {4} bytes)\n", - Left.value(), File1.getFilePath(), - LeftSize, File2.getFilePath(), RightSize); - } + } + if (!Common.empty()) { + outs().indent(2) << formatv("Found {0} common streams. Searching for " + "intra-stream differences.\n", + Common.size()); + bool HasCommonDifferences = false; + for (const auto &Left : Common) { + // Left was copied from the first range so its index refers to a stream + // index in the first file. Find the corresponding stream index in the + // second file. + auto Range = + std::equal_range(QI.begin(), QI.end(), Left, + [](const value_type &L, const value_type &R) { + return L.value() < R.value(); + }); + const auto &Right = *Range.first; + assert(Left.value() == Right.value()); + uint32_t LeftSize = File1.getStreamByteSize(Left.index()); + uint32_t RightSize = File2.getStreamByteSize(Right.index()); + if (LeftSize != RightSize) { + HasDifferences = true; + HasCommonDifferences = true; + outs().indent(4) << formatv("{0} ({1}: {2} bytes, {3}: {4} bytes)\n", + Left.value(), File1.getFilePath(), LeftSize, + File2.getFilePath(), RightSize); } - if (!HasCommonDifferences) - outs().indent(2) << "Common Streams: No differences detected!\n"; } - if (!HasDifferences) - outs() << "Stream Directory: No differences detected!\n"; + if (!HasCommonDifferences) + outs().indent(2) << "Common Streams: No differences detected!\n"; } + if (!HasDifferences) + outs() << "Stream Directory: No differences detected!\n"; return Error::success(); } @@ -384,77 +331,39 @@ auto IdList1 = ST1.name_ids(); auto IdList2 = ST2.name_ids(); - if (opts::diff::Pedantic) { - // In pedantic mode, we compare index by index (i.e. the strings are in the - // same order - // in both tables. - uint32_t Max = std::max(IdList1.size(), IdList2.size()); - for (uint32_t I = 0; I < Max; ++I) { - Optional Id1, Id2; - StringRef S1, S2; - if (I < IdList1.size()) { - Id1 = IdList1[I]; - if (auto Result = ST1.getStringForID(*Id1)) - S1 = *Result; - else - return Result.takeError(); - } - if (I < IdList2.size()) { - Id2 = IdList2[I]; - if (auto Result = ST2.getStringForID(*Id2)) - S2 = *Result; - else - return Result.takeError(); - } - if (Id1 == Id2 && S1 == S2) - continue; - - std::string OutId1 = - Id1 ? formatv("{0}", *Id1).str() : "(index not present)"; - std::string OutId2 = - Id2 ? 
formatv("{0}", *Id2).str() : "(index not present)"; - outs() << formatv(" String {0}\n", I); - outs() << formatv(" {0}: Hash - {1}, Value - {2}\n", - File1.getFilePath(), OutId1, S1); - outs() << formatv(" {0}: Hash - {1}, Value - {2}\n", - File2.getFilePath(), OutId2, S2); - HasDiff = true; - } - } else { - std::vector Strings1, Strings2; - Strings1.reserve(IdList1.size()); - Strings2.reserve(IdList2.size()); - for (auto ID : IdList1) { - auto S = ST1.getStringForID(ID); - if (!S) - return S.takeError(); - Strings1.push_back(*S); - } - for (auto ID : IdList2) { - auto S = ST2.getStringForID(ID); - if (!S) - return S.takeError(); - Strings2.push_back(*S); - } + std::vector Strings1, Strings2; + Strings1.reserve(IdList1.size()); + Strings2.reserve(IdList2.size()); + for (auto ID : IdList1) { + auto S = ST1.getStringForID(ID); + if (!S) + return S.takeError(); + Strings1.push_back(*S); + } + for (auto ID : IdList2) { + auto S = ST2.getStringForID(ID); + if (!S) + return S.takeError(); + Strings2.push_back(*S); + } - SmallVector OnlyP; - SmallVector OnlyQ; - auto End1 = std::remove(Strings1.begin(), Strings1.end(), ""); - auto End2 = std::remove(Strings2.begin(), Strings2.end(), ""); - uint32_t Empty1 = std::distance(End1, Strings1.end()); - uint32_t Empty2 = std::distance(End2, Strings2.end()); - Strings1.erase(End1, Strings1.end()); - Strings2.erase(End2, Strings2.end()); - set_differences(Strings1, Strings2, &OnlyP, &OnlyQ); - printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "String"); - - if (Empty1 != Empty2) { - PDBFile &MoreF = (Empty1 > Empty2) ? File1 : File2; - PDBFile &LessF = (Empty1 < Empty2) ? File1 : File2; - uint32_t Difference = AbsoluteDifference(Empty1, Empty2); - outs() << formatv(" {0} had {1} more empty strings than {2}\n", - MoreF.getFilePath(), Difference, LessF.getFilePath()); - } + SmallVector OnlyP; + SmallVector OnlyQ; + auto End1 = std::remove(Strings1.begin(), Strings1.end(), ""); + auto End2 = std::remove(Strings2.begin(), Strings2.end(), ""); + uint32_t Empty1 = std::distance(End1, Strings1.end()); + uint32_t Empty2 = std::distance(End2, Strings2.end()); + Strings1.erase(End1, Strings1.end()); + Strings2.erase(End2, Strings2.end()); + set_differences(Strings1, Strings2, &OnlyP, &OnlyQ); + printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "String"); + + if (Empty1 != Empty2) { + PDBFile &MoreF = (Empty1 > Empty2) ? File1 : File2; + PDBFile &LessF = (Empty1 < Empty2) ? 
File1 : File2; + uint32_t Difference = AbsoluteDifference(Empty1, Empty2); + outs() << formatv(" {0} had {1} more empty strings than {2}\n", + MoreF.getFilePath(), Difference, LessF.getFilePath()); } if (!HasDiff) outs() << "String Table: No differences detected!\n"; Index: tools/llvm-pdbutil/llvm-pdbutil.h =================================================================== --- tools/llvm-pdbutil/llvm-pdbutil.h +++ tools/llvm-pdbutil/llvm-pdbutil.h @@ -127,10 +127,6 @@ extern llvm::cl::opt RawAll; } -namespace diff { -extern llvm::cl::opt Pedantic; -} - namespace pdb2yaml { extern llvm::cl::opt All; extern llvm::cl::opt NoFileHeaders; Index: tools/llvm-pdbutil/llvm-pdbutil.cpp =================================================================== --- tools/llvm-pdbutil/llvm-pdbutil.cpp +++ tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -256,11 +256,6 @@ } namespace diff { -cl::opt Pedantic("pedantic", - cl::desc("Finds all differences (even structural ones " - "that produce otherwise identical PDBs)"), - cl::sub(DiffSubcommand)); - cl::list InputFilenames(cl::Positional, cl::desc(" "), cl::OneOrMore, cl::sub(DiffSubcommand)); Index: unittests/Option/OptionParsingTest.cpp =================================================================== --- unittests/Option/OptionParsingTest.cpp +++ unittests/Option/OptionParsingTest.cpp @@ -18,8 +18,9 @@ enum ID { OPT_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) OPT_##ID, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, #include "Opts.inc" LastOption #undef OPTION @@ -36,10 +37,10 @@ }; static const OptTable::Info InfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ - { PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, PARAM, \ - FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS }, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS, VALUES}, #include "Opts.inc" #undef OPTION }; Index: unittests/Support/CommandLineTest.cpp =================================================================== --- unittests/Support/CommandLineTest.cpp +++ unittests/Support/CommandLineTest.cpp @@ -13,6 +13,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/StringSaver.h" #include "gtest/gtest.h" #include @@ -546,6 +547,11 @@ } } +TEST(CommandLineTest, ArgumentLimit) { + std::string args(32 * 4096, 'a'); + EXPECT_FALSE(llvm::sys::commandLineFitsWithinSystemLimits("cl", args.data())); +} + TEST(CommandLineTest, ResponseFiles) { llvm::SmallString<128> TestDir; std::error_code EC = Index: unittests/Support/ErrorTest.cpp =================================================================== --- unittests/Support/ErrorTest.cpp +++ unittests/Support/ErrorTest.cpp @@ -475,6 +475,10 @@ int X = cantFail(Expected(42)); EXPECT_EQ(X, 42) << "Expected value modified by cantFail"; + + int Dummy = 42; + int &Y = cantFail(Expected(Dummy)); + EXPECT_EQ(&Dummy, &Y) << "Reference mangled by cantFail"; } // Test that cantFail results in a crash if you pass it a failure value. 
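The reference test added above depends on the unwrapped result aliasing the original object rather than copying it. A minimal standalone sketch of that behavior follows; the variable names are invented for illustration and are not part of the patch.

#include "llvm/Support/Error.h"
#include <cassert>

int main() {
  int Counter = 0;
  // The reference-taking cantFail overload hands back a reference to the very
  // object wrapped by the Expected, so writes through it are visible outside.
  int &Ref = llvm::cantFail(llvm::Expected<int &>(Counter));
  ++Ref;
  assert(&Ref == &Counter && Counter == 1);
  return 0;
}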
Index: utils/TableGen/CodeGenDAGPatterns.cpp =================================================================== --- utils/TableGen/CodeGenDAGPatterns.cpp +++ utils/TableGen/CodeGenDAGPatterns.cpp @@ -2762,8 +2762,8 @@ AnalyzeNode(Pat->getTree(0)); } - void Analyze(const PatternToMatch *Pat) { - AnalyzeNode(Pat->getSrcPattern()); + void Analyze(const PatternToMatch &Pat) { + AnalyzeNode(Pat.getSrcPattern()); } private: @@ -3289,9 +3289,7 @@ // Second, look for single-instruction patterns defined outside the // instruction. - for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) { - const PatternToMatch &PTM = *I; - + for (const PatternToMatch &PTM : ptms()) { // We can only infer from single-instruction patterns, otherwise we won't // know which instruction should get the flags. SmallVector PatInstrs; @@ -3307,7 +3305,7 @@ continue; InstAnalyzer PatInfo(*this); - PatInfo.Analyze(&PTM); + PatInfo.Analyze(PTM); Errors += InferFromPattern(InstInfo, PatInfo, PTM.getSrcRecord()); } @@ -3367,7 +3365,7 @@ // Analyze the source pattern. InstAnalyzer PatInfo(*this); - PatInfo.Analyze(&PTM); + PatInfo.Analyze(PTM); // Collect error messages. SmallVector Msgs; Index: utils/TableGen/OptParserEmitter.cpp =================================================================== --- utils/TableGen/OptParserEmitter.cpp +++ utils/TableGen/OptParserEmitter.cpp @@ -196,6 +196,9 @@ OS << ", nullptr"; // The option meta-variable name (unused). + OS << ", nullptr"; + + // The option Values (unused for groups). OS << ", nullptr)\n"; } OS << "\n"; @@ -285,6 +288,13 @@ else OS << "nullptr"; + // The option Values. Used for shell autocompletion. + OS << ", "; + if (!isa(R.getValueInit("Values"))) + write_cstring(OS, R.getValueAsString("Values")); + else + OS << "nullptr"; + OS << ")\n"; } OS << "#endif // OPTION\n";
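For orientation, here is a hedged sketch of what a generated Opts.inc entry could look like once the emitter above writes the extra Values column; the option name, help text, metavar, and value list are invented and not taken from the patch. The trailing string is the new VALUES argument used for shell autocompletion, assumed here to be a comma-separated list; options without a Values setting get nullptr instead.

// Hypothetical entry that OptParserEmitter could produce in Opts.inc; the
// last argument is the new VALUES column added by this patch.
OPTION(prefix_1, "color=", color_EQ, Joined, INVALID, INVALID, nullptr, 0, 0,
       "Colorize the output", "<when>", "always,auto,never")

With the OPTION macros updated as in llvm-cvtres and the option-parsing unit test above, such an entry expands into an OptTable::Info record whose final field carries the same string.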