Index: bindings/go/llvm/ir.go
===================================================================
--- bindings/go/llvm/ir.go
+++ bindings/go/llvm/ir.go
@@ -611,6 +611,12 @@
 }
 
 // Operations on array, pointer, and vector types (sequence types)
+func (t Type) Subtypes() (ret []Type) {
+	ret = make([]Type, C.LLVMGetNumContainedTypes(t.C))
+	C.LLVMGetSubtypes(t.C, llvmTypeRefPtr(&ret[0]))
+	return
+}
+
 func ArrayType(elementType Type, elementCount int) (t Type) {
 	t.C = C.LLVMArrayType(elementType.C, C.unsigned(elementCount))
 	return
Index: bindings/go/llvm/ir_test.go
===================================================================
--- bindings/go/llvm/ir_test.go
+++ bindings/go/llvm/ir_test.go
@@ -134,3 +134,29 @@
 		t.Errorf("Got metadata %v as scope, though wanted %v", loc.Scope.C, scope.C)
 	}
 }
+
+func TestSubtypes(t *testing.T) {
+	cont := NewContext()
+	defer cont.Dispose()
+
+	int_pointer := PointerType(cont.Int32Type(), 0)
+	int_inner := int_pointer.Subtypes()
+	if len(int_inner) != 1 {
+		t.Errorf("Got size %d, though wanted 1", len(int_inner))
+	}
+	if int_inner[0] != cont.Int32Type() {
+		t.Errorf("Expected int32 type")
+	}
+
+	st_pointer := cont.StructType([]Type{cont.Int32Type(), cont.Int8Type()}, false)
+	st_inner := st_pointer.Subtypes()
+	if len(st_inner) != 2 {
+		t.Errorf("Got size %d, though wanted 2", len(st_inner))
+	}
+	if st_inner[0] != cont.Int32Type() {
+		t.Errorf("Expected first struct field to be int32")
+	}
+	if st_inner[1] != cont.Int8Type() {
+		t.Errorf("Expected second struct field to be int8")
+	}
+}
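The Go Subtypes wrapper above is a thin veneer over two llvm-c entry points, LLVMGetNumContainedTypes and LLVMGetSubtypes. A minimal C++ sketch of the same call sequence (the helper name is ours; unlike the Go code it also guards the zero-subtype case, where &ret[0] would be out of bounds):

  #include "llvm-c/Core.h"
  #include <vector>

  // Fetch the contained types of Ty: the pointee for pointers, the element
  // type for arrays/vectors, or the field types for structs.
  std::vector<LLVMTypeRef> subtypes(LLVMTypeRef Ty) {
    std::vector<LLVMTypeRef> Ret(LLVMGetNumContainedTypes(Ty));
    if (!Ret.empty())
      LLVMGetSubtypes(Ty, Ret.data());
    return Ret;
  }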
Index: docs/ReleaseNotes.rst
===================================================================
--- docs/ReleaseNotes.rst
+++ docs/ReleaseNotes.rst
@@ -43,6 +43,13 @@
 * LLVM's ``WeakVH`` has been renamed to ``WeakTrackingVH`` and a new ``WeakVH``
   has been introduced. The new ``WeakVH`` nulls itself out on deletion, but does
   not track values across RAUW.
+
+* A new library named ``BinaryFormat`` has been created to hold a collection
+  of code that previously lived in ``Support``. This includes the
+  ``file_magic`` structure and ``identify_magic`` functions, as well as all the
+  structure and type definitions for DWARF, ELF, COFF, WASM, and MachO file
+  formats.
+
 * ... next change ...
Index: include/llvm/Analysis/BranchProbabilityInfo.h
===================================================================
--- include/llvm/Analysis/BranchProbabilityInfo.h
+++ include/llvm/Analysis/BranchProbabilityInfo.h
@@ -26,6 +26,7 @@
 namespace llvm {
 class LoopInfo;
+class TargetLibraryInfo;
 class raw_ostream;
 
 /// \brief Analysis providing branch probability information.
@@ -43,8 +44,9 @@
 class BranchProbabilityInfo {
 public:
   BranchProbabilityInfo() {}
-  BranchProbabilityInfo(const Function &F, const LoopInfo &LI) {
-    calculate(F, LI);
+  BranchProbabilityInfo(const Function &F, const LoopInfo &LI,
+                        const TargetLibraryInfo *TLI = nullptr) {
+    calculate(F, LI, TLI);
   }
 
   BranchProbabilityInfo(BranchProbabilityInfo &&Arg)
@@ -116,7 +118,8 @@
     return IsLikely ? LikelyProb : LikelyProb.getCompl();
   }
 
-  void calculate(const Function &F, const LoopInfo &LI);
+  void calculate(const Function &F, const LoopInfo &LI,
+                 const TargetLibraryInfo *TLI = nullptr);
 
   /// Forget analysis results for the given basic block.
   void eraseBlock(const BasicBlock *BB);
@@ -171,7 +174,7 @@
   bool calcColdCallHeuristics(const BasicBlock *BB);
   bool calcPointerHeuristics(const BasicBlock *BB);
   bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI);
-  bool calcZeroHeuristics(const BasicBlock *BB);
+  bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI);
   bool calcFloatingPointHeuristics(const BasicBlock *BB);
   bool calcInvokeHeuristics(const BasicBlock *BB);
 };
Index: include/llvm/Analysis/LazyBranchProbabilityInfo.h
===================================================================
--- include/llvm/Analysis/LazyBranchProbabilityInfo.h
+++ include/llvm/Analysis/LazyBranchProbabilityInfo.h
@@ -24,6 +24,7 @@
 class AnalysisUsage;
 class Function;
 class LoopInfo;
+class TargetLibraryInfo;
 
 /// \brief This is an alternative analysis pass to
 /// BranchProbabilityInfoWrapperPass. The difference is that with this pass the
@@ -55,14 +56,15 @@
   /// analysis without paying for the overhead if BPI doesn't end up being used.
   class LazyBranchProbabilityInfo {
   public:
-    LazyBranchProbabilityInfo(const Function *F, const LoopInfo *LI)
-        : Calculated(false), F(F), LI(LI) {}
+    LazyBranchProbabilityInfo(const Function *F, const LoopInfo *LI,
+                              const TargetLibraryInfo *TLI)
+        : Calculated(false), F(F), LI(LI), TLI(TLI) {}
 
     /// Retrieve the BPI with the branch probabilities computed.
     BranchProbabilityInfo &getCalculated() {
       if (!Calculated) {
         assert(F && LI && "call setAnalysis");
-        BPI.calculate(*F, *LI);
+        BPI.calculate(*F, *LI, TLI);
         Calculated = true;
       }
       return BPI;
@@ -77,6 +79,7 @@
     bool Calculated;
     const Function *F;
     const LoopInfo *LI;
+    const TargetLibraryInfo *TLI;
   };
 
   std::unique_ptr<LazyBranchProbabilityInfo> LBPI;
Index: include/llvm/Analysis/TargetLibraryInfo.h
===================================================================
--- include/llvm/Analysis/TargetLibraryInfo.h
+++ include/llvm/Analysis/TargetLibraryInfo.h
@@ -13,6 +13,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
@@ -239,6 +240,13 @@
     return Impl->getLibFunc(FDecl, F);
   }
 
+  /// If a callsite does not have the 'nobuiltin' attribute, return whether the
+  /// called function is a known library function and set F to that function.
+  bool getLibFunc(ImmutableCallSite CS, LibFunc &F) const {
+    return !CS.isNoBuiltin() && CS.getCalledFunction() &&
+           getLibFunc(*(CS.getCalledFunction()), F);
+  }
+
   /// Tests whether a library function is available.
   bool has(LibFunc F) const {
     return Impl->getState(F) != TargetLibraryInfoImpl::Unavailable;
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -1092,10 +1092,10 @@
   ///            \----------------v-------------/  \----------v------------/
   ///                    n/2 elements               n/2 elements
   /// %red1 = op <n x t> %val, <n x t> val1
-  /// After this operation we have a vector %red1 with only maningfull the
-  /// first n/2 elements, the second n/2 elements are undefined and can be
+  /// After this operation we have a vector %red1 where only the first n/2
+  /// elements are meaningful, the second n/2 elements are undefined and can be
   /// dropped. All other operations are actually working with the vector of
-  /// length n/2, not n. though the real vector length is still n.
+  /// length n/2, not n, though the real vector length is still n.
   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
   ///            \----------------v-------------/  \----------v------------/
Index: include/llvm/DebugInfo/PDB/PDBSymbol.h
===================================================================
--- include/llvm/DebugInfo/PDB/PDBSymbol.h
+++ include/llvm/DebugInfo/PDB/PDBSymbol.h
@@ -89,6 +89,8 @@
   template <typename T>
   std::unique_ptr<T> findOneChild() const {
     auto Enumerator(findAllChildren<T>());
+    if (!Enumerator)
+      return nullptr;
     return Enumerator->getNext();
   }
 
@@ -97,6 +99,8 @@
   template <typename T>
   std::unique_ptr<IPDBEnumChildren<T>> findAllChildren() const {
     auto BaseIter = RawSymbol->findChildren(T::Tag);
+    if (!BaseIter)
+      return nullptr;
     return llvm::make_unique<ConcreteSymbolEnumerator<T>>(std::move(BaseIter));
   }
   std::unique_ptr<IPDBEnumSymbols> findAllChildren(PDB_SymType Type) const;
Index: lib/Analysis/BranchProbabilityInfo.cpp
===================================================================
--- lib/Analysis/BranchProbabilityInfo.cpp
+++ lib/Analysis/BranchProbabilityInfo.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -30,6 +31,7 @@
 INITIALIZE_PASS_BEGIN(BranchProbabilityInfoWrapperPass, "branch-prob",
                       "Branch Probability Analysis", false, true)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(BranchProbabilityInfoWrapperPass, "branch-prob",
                     "Branch Probability Analysis", false, true)
 
@@ -457,7 +459,8 @@
   return true;
 }
 
-bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB) {
+bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB,
+                                               const TargetLibraryInfo *TLI) {
   const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
   if (!BI || !BI->isConditional())
     return false;
@@ -480,8 +483,37 @@
     if (AndRHS->getUniqueInteger().isPowerOf2())
       return false;
 
+  // Check if the LHS is the return value of a library function
+  LibFunc Func = NumLibFuncs;
+  if (TLI)
+    if (CallInst *Call = dyn_cast<CallInst>(CI->getOperand(0)))
+      if (Function *CalledFn = Call->getCalledFunction())
+        TLI->getLibFunc(*CalledFn, Func);
+
   bool isProb;
-  if (CV->isZero()) {
+  if (Func == LibFunc_strcasecmp ||
+      Func == LibFunc_strcmp ||
+      Func == LibFunc_strncasecmp ||
+      Func == LibFunc_strncmp ||
+      Func == LibFunc_memcmp) {
+    // strcmp and similar functions return zero, negative, or positive if the
+    // first string is equal to, less than, or greater than the second. We
+    // consider it likely that the strings are not equal, so a comparison with
+    // zero is probably false; a comparison with any other number is probably
+    // false as well, since what exactly is returned for nonzero values is not
+    // specified. We know nothing about comparisons other than equality and
+    // inequality.
+    switch (CI->getPredicate()) {
+    case CmpInst::ICMP_EQ:
+      isProb = false;
+      break;
+    case CmpInst::ICMP_NE:
+      isProb = true;
+      break;
+    default:
+      return false;
+    }
+  } else if (CV->isZero()) {
     switch (CI->getPredicate()) {
     case CmpInst::ICMP_EQ:
       // X == 0   ->  Unlikely
@@ -707,7 +739,8 @@
   }
 }
 
-void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
+void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
+                                      const TargetLibraryInfo *TLI) {
   DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
                << " ----\n\n");
   LastF = &F; // Store the last function we ran on for printing.
@@ -733,7 +766,7 @@
       continue;
     if (calcPointerHeuristics(BB))
       continue;
-    if (calcZeroHeuristics(BB))
+    if (calcZeroHeuristics(BB, TLI))
      continue;
    if (calcFloatingPointHeuristics(BB))
      continue;
@@ -747,12 +780,14 @@
 void BranchProbabilityInfoWrapperPass::getAnalysisUsage(
     AnalysisUsage &AU) const {
   AU.addRequired<LoopInfoWrapperPass>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
 bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) {
   const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  BPI.calculate(F, LI);
+  const TargetLibraryInfo &TLI =
+      getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  BPI.calculate(F, LI, &TLI);
   return false;
 }
 
@@ -767,7 +802,7 @@
 BranchProbabilityInfo
 BranchProbabilityAnalysis::run(Function &F, FunctionAnalysisManager &AM) {
   BranchProbabilityInfo BPI;
-  BPI.calculate(F, AM.getResult<LoopAnalysis>(F));
+  BPI.calculate(F, AM.getResult<LoopAnalysis>(F),
+                &AM.getResult<TargetLibraryAnalysis>(F));
   return BPI;
 }
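The effect of the new calcZeroHeuristics branch is easiest to see in isolation. A reduced, hypothetical classifier with the same EQ-unlikely/NE-likely outcome (only strcmp/memcmp shown here; the patch also covers strncmp and the case-insensitive variants):

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Returns true when CI is the "unlikely" shape: a library string/memory
  // comparison tested for equality with a constant. Anything else gets no bias.
  static bool isLibCmpEqUnlikely(const ICmpInst *CI,
                                 const TargetLibraryInfo &TLI) {
    LibFunc Func;
    auto *Call = dyn_cast<CallInst>(CI->getOperand(0));
    if (!Call || !Call->getCalledFunction() ||
        !TLI.getLibFunc(*Call->getCalledFunction(), Func))
      return false; // not a recognized library call
    if (Func != LibFunc_strcmp && Func != LibFunc_memcmp)
      return false;
    return CI->getPredicate() == CmpInst::ICMP_EQ; // equal strings: unlikely
  }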
Index: lib/Analysis/LazyBranchProbabilityInfo.cpp
===================================================================
--- lib/Analysis/LazyBranchProbabilityInfo.cpp
+++ lib/Analysis/LazyBranchProbabilityInfo.cpp
@@ -16,6 +16,7 @@
 
 #include "llvm/Analysis/LazyBranchProbabilityInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 
 using namespace llvm;
 
@@ -24,6 +25,7 @@
 INITIALIZE_PASS_BEGIN(LazyBranchProbabilityInfoPass, DEBUG_TYPE,
                       "Lazy Branch Probability Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(LazyBranchProbabilityInfoPass, DEBUG_TYPE,
                     "Lazy Branch Probability Analysis", true, true)
 
@@ -41,6 +43,7 @@
 
 void LazyBranchProbabilityInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<LoopInfoWrapperPass>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
@@ -48,16 +51,19 @@
 
 bool LazyBranchProbabilityInfoPass::runOnFunction(Function &F) {
   LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-  LBPI = llvm::make_unique<LazyBranchProbabilityInfo>(&F, &LI);
+  TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LBPI = llvm::make_unique<LazyBranchProbabilityInfo>(&F, &LI, &TLI);
   return false;
 }
 
 void LazyBranchProbabilityInfoPass::getLazyBPIAnalysisUsage(AnalysisUsage &AU) {
   AU.addRequired<LazyBranchProbabilityInfoPass>();
   AU.addRequired<LoopInfoWrapperPass>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
 }
 
 void llvm::initializeLazyBPIPassPass(PassRegistry &Registry) {
   INITIALIZE_PASS_DEPENDENCY(LazyBranchProbabilityInfoPass);
   INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
+  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass);
 }
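The Lazy wrapper above now threads TLI through, but keeps the same deferred-computation shape: nothing is computed until getCalculated() runs. The pattern in isolation, as a generic sketch (names are ours, not LLVM's):

  #include <functional>

  // Memoized-computation pattern behind LazyBranchProbabilityInfo: store the
  // inputs as a closure, run the expensive calculation on first use, cache it.
  template <typename T> class LazyResult {
    std::function<T()> Compute;
    T Value{};
    bool Calculated = false;

  public:
    explicit LazyResult(std::function<T()> C) : Compute(std::move(C)) {}
    T &get() {
      if (!Calculated) {
        Value = Compute();
        Calculated = true;
      }
      return Value;
    }
  };

A caller would construct this with a closure capturing F, LI, and TLI, mirroring the pass's setAnalysis/getCalculated split.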
Index: lib/Analysis/LazyValueInfo.cpp
===================================================================
--- lib/Analysis/LazyValueInfo.cpp
+++ lib/Analysis/LazyValueInfo.cpp
@@ -302,7 +302,7 @@
 /// contradictory. If this happens, we return some valid lattice value so as
 /// not to confuse the rest of LVI. Ideally, we'd always return Undefined, but
 /// we do not make this guarantee. TODO: This would be a useful enhancement.
-static LVILatticeVal intersect(LVILatticeVal A, LVILatticeVal B) {
+static LVILatticeVal intersect(const LVILatticeVal &A, const LVILatticeVal &B) {
   // Undefined is the strongest state. It means the value is known to be along
   // an unreachable path.
   if (A.isUndefined())
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -235,12 +235,12 @@
     void eliminateMostlyEmptyBlock(BasicBlock *BB);
     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
                                        bool isPreheader);
-    bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
-    bool optimizeInst(Instruction *I, bool& ModifiedDT);
+    bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+    bool optimizeInst(Instruction *I, bool &ModifiedDT);
     bool optimizeMemoryInst(Instruction *I, Value *Addr,
                             Type *AccessTy, unsigned AS);
     bool optimizeInlineAsmInst(CallInst *CS);
-    bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
+    bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
     bool optimizeExt(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
     bool optimizeLoadExt(LoadInst *I);
@@ -1675,6 +1675,7 @@
   void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
   void emitMemCmpResultBlock(bool IsLittleEndian);
   Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  Value *getMemCmpEqZeroOneBlock(unsigned Size);
   unsigned getLoadSize(unsigned Size);
   unsigned getNumLoads(unsigned Size);
 
@@ -1699,31 +1700,35 @@
                                unsigned MaxLoadSize, unsigned LoadsPerBlock)
     : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {
-  IRBuilder<> Builder(CI->getContext());
-
-  BasicBlock *StartBlock = CI->getParent();
-  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
-  setupEndBlockPHINodes();
+  // A memcmp that is only used in a zero-equality comparison and needs just
+  // one block of loads and compares does not need to set up any extra blocks.
+  // This case could be handled in the DAG, but since we have all of the
+  // machinery to flexibly expand any memcmp here, we choose to handle this
+  // case too to avoid fragmented lowering.
   IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-
-  // Calculate how many load compare blocks are required for an expansion of
-  // given Size.
   NumBlocks = calculateNumBlocks(Size);
-  createResultBlock();
+  if (!IsUsedForZeroCmp || NumBlocks != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
 
-  // If return value of memcmp is not used in a zero equality, we need to
-  // calculate which source was larger. The calculation requires the
-  // two loaded source values of each load compare block.
-  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-  if (!IsUsedForZeroCmp)
-    setupResultBlockPHINodes();
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp)
+      setupResultBlockPHINodes();
 
-  // Create the number of required load compare basic blocks.
-  createLoadCmpBlocks();
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
+
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
 
-  // Update the terminator added by splitBasicBlock to branch to the first
-  // LoadCmpBlock.
+  IRBuilder<> Builder(CI->getContext());
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
 }
 
 void MemCmpExpansion::createLoadCmpBlocks() {
@@ -1810,7 +1815,12 @@
   unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
   unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
 
-  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
   Value *Cmp = nullptr;
   for (unsigned i = 0; i < NumLoads; ++i) {
     unsigned LoadSize = getLoadSize(RemainingBytes);
@@ -2071,11 +2081,22 @@
   return PhiRes;
 }
 
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
+  unsigned NumBytesProcessed = 0;
+  IRBuilder<> Builder(CI->getContext());
+  Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder);
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
   if (IsUsedForZeroCmp)
-    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+    return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
+                            getMemCmpExpansionZeroCase(Size, IsLittleEndian);
 
   // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
   // memcmp sources. It starts with loading using the maximum load size set by
@@ -2232,7 +2253,7 @@
   return true;
 }
 
-bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -2385,12 +2406,10 @@
   }
 
   LibFunc Func;
-  if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) &&
-      Func == LibFunc_memcmp) {
-    if (expandMemCmp(CI, TTI, TLI, DL)) {
-      ModifiedDT = true;
-      return true;
-    }
+  if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
+      Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
+    ModifiedDT = true;
+    return true;
   }
   return false;
 }
@@ -6018,7 +6037,7 @@
   return true;
 }
 
-bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
   if (InsertedInsts.count(I))
@@ -6173,7 +6192,7 @@
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
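For a feel of what the single-block zero-equality path emits: with a 4-byte size and one load pair, the whole call reduces to two loads, one compare, and a zext in the caller's block, with no new control flow. A hand-written IRBuilder sketch of that shape (assumed helper, not the pass's actual code):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Roughly what the one-block expansion lowers memcmp(p, q, 4) to. The result
  // is 0 iff the two 4-byte blocks are equal, so a `== 0` user folds to ICmpEQ.
  static Value *expandMemCmpEq4(IRBuilder<> &B, Value *P, Value *Q) {
    Type *I32 = B.getInt32Ty();
    Value *L = B.CreateLoad(I32, B.CreateBitCast(P, I32->getPointerTo()));
    Value *R = B.CreateLoad(I32, B.CreateBitCast(Q, I32->getPointerTo()));
    Value *NE = B.CreateICmpNE(L, R); // single compare, no branches
    return B.CreateZExt(NE, I32);     // nonzero iff the blocks differ
  }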
Index: lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
===================================================================
--- lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -372,8 +372,11 @@
   enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
 
   CComPtr<IDiaEnumSymbols> DiaEnumerator;
-  if (S_OK != Symbol->findChildrenEx(EnumVal, nullptr, nsNone, &DiaEnumerator))
-    return nullptr;
+  if (S_OK !=
+      Symbol->findChildrenEx(EnumVal, nullptr, nsNone, &DiaEnumerator)) {
+    if (S_OK != Symbol->findChildren(EnumVal, nullptr, nsNone, &DiaEnumerator))
+      return nullptr;
+  }
 
   return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
 }
Index: lib/MC/MCExpr.cpp
===================================================================
--- lib/MC/MCExpr.cpp
+++ lib/MC/MCExpr.cpp
@@ -655,8 +655,12 @@
       // the OS X assembler will completely drop the 4. We should probably
       // include it in the relocation or produce an error if that is not
       // possible.
+      // Allow constant expressions.
       if (!A && !B)
         return true;
+      // Allow aliases with zero offset.
+      if (Res.getConstant() == 0 && (!A || !B))
+        return true;
     }
   }
Index: lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
@@ -0,0 +1,353 @@
+//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===//
+
+#ifdef AMDGPU_REG_ASM_NAMES
+
+static const char *const VGPR32RegNames[] = {
+  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
+  "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+  "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+  "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35",
+  "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44",
+  "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53",
+  "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62",
+  "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71",
+  "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80",
+  "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
+  "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98",
+  "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
+  "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
+  "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125",
+  "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134",
+  "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143",
+  "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152",
+  "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161",
+  "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170",
+  "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+  "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
+  "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197",
+  "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206",
+  "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215",
+  "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224",
+  "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233",
+  "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242",
+  "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+  "v252", "v253", "v254", "v255"
+};
"v246", "v247", "v248", "v249", "v250", "v251", + "v252", "v253", "v254", "v255" +}; + +static const char *const SGPR32RegNames[] = { + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", + "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", + "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", + "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39", + "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49", + "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59", + "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69", + "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79", + "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89", + "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99", + "s100", "s101", "s102", "s103" +}; + +static const char *const VGPR64RegNames[] = { + "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]", + "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]", + "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]", + "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]", + "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]", + "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]", + "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]", + "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]", + "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]", + "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]", + "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]", + "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]", + "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]", + "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]", + "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]", + "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]", + "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]", + "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]", + "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]", + "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]", + "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]", + "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]", + "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]", + "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]", + "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]", + "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]", + "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]", + "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]", + "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]", + "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]", + "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]", + "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]", + "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]", + "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]", + "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]", + "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]", + "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]", + "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]", + "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]", + "v[195:196]", "v[196:197]", 
"v[197:198]", "v[198:199]", "v[199:200]", + "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]", + "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]", + "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]", + "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]", + "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]", + "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]", + "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]", + "v[235:236]", "v[236:237]", "v[237:238]", "v[238:239]", "v[239:240]", + "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]", + "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]", + "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]" +}; + +static const char *const VGPR96RegNames[] = { + "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]", + "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]", + "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]", + "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]", + "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]", + "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]", + "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]", + "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]", + "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]", + "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]", + "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]", + "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]", + "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]", + "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]", + "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]", + "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]", + "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]", + "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]", + "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]", + "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]", + "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]", + "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]", + "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]", + "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]", + "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]", + "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]", + "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]", + "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]", + "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]", + "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]", + "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]", + "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]", + "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]", + "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]", + "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]", + "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]", + "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]", + "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]", + "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]", + "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]", + 
"v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]", + "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]", + "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]", + "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]", + "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]", + "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]", + "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]", + "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]", + "v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]", + "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]", + "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]" +}; + +static const char *const VGPR128RegNames[] = { + "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]", + "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]", + "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]", + "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]", + "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]", + "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]", + "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]", + "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]", + "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]", + "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]", + "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]", + "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]", + "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]", + "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]", + "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]", + "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]", + "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]", + "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]", + "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]", + "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]", + "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]", + "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]", + "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]", + "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]", + "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]", + "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]", + "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]", + "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]", + "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]", + "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]", + "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]", + "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]", + "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]", + "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]", + "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]", + "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]", + "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]", + "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]", + "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]", + "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]", + "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", 
"v[204:207]", + "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]", + "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]", + "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]", + "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]", + "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]", + "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]", + "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]", + "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", "v[244:247]", + "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]", + "v[250:253]", "v[251:254]", "v[252:255]" +}; + +static const char *const VGPR256RegNames[] = { + "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]", + "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]", + "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]", + "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]", + "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]", + "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]", + "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]", + "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]", + "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]", + "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]", + "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]", + "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]", + "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]", + "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]", + "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]", + "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]", + "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]", + "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]", + "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]", + "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]", + "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]", + "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]", + "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]", + "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]", + "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]", + "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]", + "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]", + "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]", + "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]", + "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]", + "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]", + "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]", + "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]", + "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]", + "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]", + "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]", + "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]", + "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]", + "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]", + "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]", + "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]", + "v[205:212]", "v[206:213]", "v[207:214]", 
"v[208:215]", "v[209:216]", + "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]", + "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]", + "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]", + "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]", + "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]", + "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]", + "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]", + "v[245:252]", "v[246:253]", "v[247:254]", "v[248:255]" +}; + +static const char *const VGPR512RegNames[] = { + "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]", + "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]", + "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]", + "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]", + "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]", + "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]", + "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]", + "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]", + "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]", + "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]", + "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]", + "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]", + "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]", + "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]", + "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]", + "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]", + "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]", + "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]", + "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]", + "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]", + "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]", + "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]", + "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]", + "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]", + "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]", + "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]", + "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]", + "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]", + "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]", + "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]", + "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]", + "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]", + "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]", + "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]", + "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]", + "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]", + "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]", + "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]", + "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]", + "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]", + "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]", + "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]", + "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]", + 
"v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]", + "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]", + "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]", + "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]", + "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]", + "v[240:255]" +}; + +static const char *const SGPR64RegNames[] = { + "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]", + "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]", + "s[24:25]", "s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]", + "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]", + "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]", + "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]", + "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]", + "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]", + "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]" +}; + +static const char *const SGPR128RegNames[] = { + "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]", + "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]", + "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]", + "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]", + "s[96:99]", "s[100:103]" +}; + +static const char *const SGPR256RegNames[] = { + "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]", + "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]", + "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]", + "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]", + "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]" +}; + +static const char *const SGPR512RegNames[] = { + "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]", + "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]", + "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]", + "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]" +}; + +#endif Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -58,6 +58,7 @@ AMDGPUISelLowering.cpp AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp + AMDGPURegAsmNames.inc.cpp AMDGPURegisterInfo.cpp AMDGPUUnifyDivergentExitNodes.cpp GCNHazardRecognizer.cpp Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -118,6 +118,8 @@ bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS) const; + StringRef getRegAsmName(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1104,6 +1104,66 @@ } } +StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const { + #define AMDGPU_REG_ASM_NAMES + #include "AMDGPURegAsmNames.inc.cpp" + + #define REG_RANGE(BeginReg, EndReg, RegTable) \ + if (Reg >= BeginReg && Reg <= EndReg) { \ + unsigned Index = Reg - BeginReg; \ + assert(Index < array_lengthof(RegTable)); \ + return RegTable[Index]; \ + } + + 
+  REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
+  REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
+  REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
+  REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
+  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255,
+            VGPR96RegNames);
+
+  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3,
+            AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255,
+            VGPR128RegNames);
+  REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+            AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103,
+            SGPR128RegNames);
+
+  REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7,
+            AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
+            VGPR256RegNames);
+
+  REG_RANGE(
+    AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15,
+    AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
+    VGPR512RegNames);
+
+  REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7,
+            AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
+            SGPR256RegNames);
+
+  REG_RANGE(
+    AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15,
+    AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
+    SGPR512RegNames
+  );
+
+#undef REG_RANGE
+
+  // FIXME: Rename flat_scr so we don't need to special case this.
+  switch (Reg) {
+  case AMDGPU::FLAT_SCR:
+    return "flat_scratch";
+  case AMDGPU::FLAT_SCR_LO:
+    return "flat_scratch_lo";
+  case AMDGPU::FLAT_SCR_HI:
+    return "flat_scratch_hi";
+  default:
+    // For the special named registers the default is fine.
+    return TargetRegisterInfo::getRegAsmName(Reg);
+  }
+}
+
 // FIXME: This is very slow. It might be worth creating a map from physreg to
 // register class.
 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
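The REG_RANGE trick relies on the generated register enum laying out each tuple class contiguously, so a name lookup is one subtraction plus an array index. A self-contained mock of that invariant (not the real AMDGPU enum or tables):

  #include <cassert>

  // Mock of one REG_RANGE expansion: registers Begin..End are contiguous enum
  // values, so Reg - Begin indexes the matching name table directly.
  static const char *const MockNames[] = {"v0", "v1", "v2", "v3"};

  const char *regAsmName(unsigned Reg, unsigned Begin, unsigned End) {
    if (Reg < Begin || Reg > End)
      return nullptr; // in the real code, fall through to the next REG_RANGE
    unsigned Index = Reg - Begin;
    assert(Index < sizeof(MockNames) / sizeof(MockNames[0]));
    return MockNames[Index];
  }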
Index: lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP3Instructions.td
+++ lib/Target/AMDGPU/VOP3Instructions.td
@@ -209,7 +209,10 @@
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+
+let Constraints = "@earlyclobber $vdst" in {
 def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I32_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+} // End Constraints = "@earlyclobber $vdst"
 
 def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
   let SchedRW = [WriteDouble];
@@ -232,8 +235,10 @@
 
 let SubtargetPredicate = isCIVI in {
 
+let Constraints = "@earlyclobber $vdst" in {
 def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I32_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
 def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+} // End Constraints = "@earlyclobber $vdst"
 
 let isCommutable = 1 in {
 def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
Index: lib/Target/Hexagon/HexagonGenMux.cpp
===================================================================
--- lib/Target/Hexagon/HexagonGenMux.cpp
+++ lib/Target/Hexagon/HexagonGenMux.cpp
@@ -235,8 +235,11 @@
       unsigned DR = MI->getOperand(0).getReg();
       if (isRegPair(DR))
         continue;
+      MachineOperand &PredOp = MI->getOperand(1);
+      if (PredOp.isUndef())
+        continue;
 
-      unsigned PR = MI->getOperand(1).getReg();
+      unsigned PR = PredOp.getReg();
       unsigned Idx = I2X.lookup(MI);
       CondsetMap::iterator F = CM.find(DR);
       bool IfTrue = HII->isPredicatedTrue(Opc);
Index: lib/Target/Hexagon/HexagonMachineScheduler.cpp
===================================================================
--- lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -563,40 +563,33 @@
 }
 #endif
 
-/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
-/// of SU, return it, otherwise return null.
-static SUnit *getSingleUnscheduledPred(SUnit *SU) {
-  SUnit *OnlyAvailablePred = nullptr;
-  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
-       I != E; ++I) {
-    SUnit &Pred = *I->getSUnit();
-    if (!Pred.isScheduled) {
-      // We found an available, but not scheduled, predecessor. If it's the
-      // only one we have found, keep track of it... otherwise give up.
-      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
-        return nullptr;
-      OnlyAvailablePred = &Pred;
-    }
+/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) {
+  if (SU->NumPredsLeft == 0)
+    return false;
+
+  for (auto &Pred : SU->Preds) {
+    // We found an available, but not scheduled, predecessor.
+    if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2))
+      return false;
   }
-  return OnlyAvailablePred;
+
+  return true;
 }
 
-/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
-/// of SU, return it, otherwise return null.
-static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
-  SUnit *OnlyAvailableSucc = nullptr;
-  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
-       I != E; ++I) {
-    SUnit &Succ = *I->getSUnit();
-    if (!Succ.isScheduled) {
-      // We found an available, but not scheduled, successor. If it's the
-      // only one we have found, keep track of it... otherwise give up.
-      if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
-        return nullptr;
-      OnlyAvailableSucc = &Succ;
-    }
+/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
+  if (SU->NumSuccsLeft == 0)
+    return false;
+
+  for (auto &Succ : SU->Succs) {
+    // We found an available, but not scheduled, successor.
+    if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2))
+      return false;
   }
-  return OnlyAvailableSucc;
+  return true;
 }
 
 // Constants used to denote relative importance of
@@ -673,12 +666,12 @@
       // Count the number of nodes that
       // this node is the sole unscheduled node for.
       for (const SDep &SI : SU->Succs)
-        if (getSingleUnscheduledPred(SI.getSUnit()) == SU)
+        if (isSingleUnscheduledPred(SI.getSUnit(), SU))
          ++NumNodesBlocking;
     } else {
       // How many unscheduled predecessors block this node?
       for (const SDep &PI : SU->Preds)
-        if (getSingleUnscheduledSucc(PI.getSUnit()) == SU)
+        if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
          ++NumNodesBlocking;
     }
     ResCount += (NumNodesBlocking * ScaleTwo);
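The rewritten helpers invert the question the old ones answered: instead of materializing the single unscheduled predecessor and comparing it to SU, they verify that no unscheduled predecessor other than the candidate exists (which also tolerates duplicate edges). The same check over mock types, as a standalone model (not LLVM's SUnit):

  #include <vector>

  struct MockNode {
    std::vector<MockNode *> Preds;
    bool Scheduled = false;
    unsigned PredsLeft = 0; // count of unscheduled predecessors
  };

  // True iff Cand is the only unscheduled predecessor of N, mirroring
  // isSingleUnscheduledPred: bail out when nothing is left; otherwise any
  // other unscheduled predecessor disqualifies N.
  bool isSoleUnscheduledPred(const MockNode *N, const MockNode *Cand) {
    if (N->PredsLeft == 0)
      return false;
    for (const MockNode *P : N->Preds)
      if (!P->Scheduled && P != Cand)
        return false;
    return true;
  }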
Index: lib/Target/PowerPC/PPCBoolRetToInt.cpp
===================================================================
--- lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -7,15 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements converting i1 values to i32 if they could be more
+// This file implements converting i1 values to i32/i64 if they could be more
 // profitably allocated as GPRs rather than CRs. This pass will become totally
 // unnecessary if Register Bank Allocation and Global Instruction Selection ever
 // go upstream.
 //
-// Presently, the pass converts i1 Constants, and Arguments to i32 if the
+// Presently, the pass converts i1 Constants, and Arguments to i32/i64 if the
 // transitive closure of their uses includes only PHINodes, CallInsts, and
 // ReturnInsts. The rationale is that arguments are generally passed and
-// returned in GPRs rather than CRs, so casting them to i32 at the LLVM IR
+// returned in GPRs rather than CRs, so casting them to i32/i64 at the LLVM IR
 // level will actually save casts at the Machine Instruction level.
 //
 // It might be useful to expand this pass to add bit-wise operations to the list
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPC.h"
+#include "PPCTargetMachine.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -51,6 +52,7 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Casting.h"
 #include <queue>
@@ -87,17 +89,19 @@
     return Defs;
   }
 
-  // Translate a i1 value to an equivalent i32 value:
-  static Value *translate(Value *V) {
-    Type *Int32Ty = Type::getInt32Ty(V->getContext());
+  // Translate a i1 value to an equivalent i32/i64 value:
+  Value *translate(Value *V) {
+    Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext())
+                                : Type::getInt32Ty(V->getContext());
+
     if (auto *C = dyn_cast<Constant>(V))
-      return ConstantExpr::getZExt(C, Int32Ty);
+      return ConstantExpr::getZExt(C, IntTy);
     if (auto *P = dyn_cast<PHINode>(V)) {
       // Temporarily set the operands to 0. We'll fix this later in
       // runOnUse.
-      Value *Zero = Constant::getNullValue(Int32Ty);
+      Value *Zero = Constant::getNullValue(IntTy);
       PHINode *Q =
-          PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P);
+          PHINode::Create(IntTy, P->getNumIncomingValues(), P->getName(), P);
       for (unsigned i = 0; i < P->getNumOperands(); ++i)
         Q->addIncoming(Zero, P->getIncomingBlock(i));
       return Q;
@@ -109,7 +113,7 @@
     auto InstPt =
         A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode();
 
-    return new ZExtInst(V, Int32Ty, "", InstPt);
+    return new ZExtInst(V, IntTy, "", InstPt);
   }
 
   typedef SmallPtrSet<PHINode *, 8> PHINodeSet;
@@ -185,6 +189,13 @@
     if (skipFunction(F))
       return false;
 
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      return false;
+
+    auto &TM = TPC->getTM<PPCTargetMachine>();
+    ST = TM.getSubtargetImpl(F);
+
     PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
     B2IMap Bool2IntMap;
     bool Changed = false;
@@ -205,7 +216,7 @@
     return Changed;
   }
 
-  static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
+  bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
                 B2IMap &BoolToIntMap) {
     auto Defs = findAllDefs(U);
 
@@ -262,13 +273,16 @@
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  const PPCSubtarget *ST;
 };
 
 } // end anonymous namespace
 
 char PPCBoolRetToInt::ID = 0;
 INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
-                "Convert i1 constants to i32 if they are returned",
+                "Convert i1 constants to i32/i64 if they are returned",
                 false, false)
 
 FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); }
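In source-level terms, the translate() rewrite means a bool that only flows through phis, calls, and returns is widened once to the GPR-sized integer, so no per-use CR-to-GPR copies are needed. A hedged sketch of the Constant case only, assuming a subtarget check equivalent to ST->isPPC64() (helper name is ours):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Widen an i1 constant with a zext constant expression so it can live in a
  // GPR: i64 on ppc64 (per the change above), i32 otherwise.
  static Constant *widenBoolConstant(Constant *C, bool IsPPC64) {
    LLVMContext &Ctx = C->getContext();
    Type *IntTy = IsPPC64 ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx);
    return ConstantExpr::getZExt(C, IntTy);
  }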
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -2717,6 +2717,40 @@
   dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
   dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
 }
+
+def ByteToWord {
+  dag A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
+  dag A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
+  dag A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
+  dag A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+}
+
+def ByteToDWord {
+  dag A0 = (i64 (sext_inreg
+                (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
+  dag A1 = (i64 (sext_inreg
+                (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+}
+
+def HWordToWord {
+  dag A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
+  dag A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
+  dag A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
+  dag A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+}
+
+def HWordToDWord {
+  dag A0 = (i64 (sext_inreg
+                (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
+  dag A1 = (i64 (sext_inreg
+                (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+}
+
+def WordToDWord {
+  dag A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
+  dag A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+}
+
 def FltToIntLoad {
   dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
 }
@@ -2969,4 +3003,21 @@
     (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
             (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
   }
+  // P9 Altivec instructions that can be used to build vectors.
+  // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
+  // with complexities of existing build vector patterns in this file.
+  let Predicates = [HasP9Altivec] in {
+    def : Pat<(v2i64 (build_vector WordToDWord.A0, WordToDWord.A1)),
+              (v2i64 (VEXTSW2D $A))>;
+    def : Pat<(v2i64 (build_vector HWordToDWord.A0, HWordToDWord.A1)),
+              (v2i64 (VEXTSH2D $A))>;
+    def : Pat<(v4i32 (build_vector HWordToWord.A0, HWordToWord.A1,
+              HWordToWord.A2, HWordToWord.A3)),
+              (v4i32 (VEXTSH2W $A))>;
+    def : Pat<(v4i32 (build_vector ByteToWord.A0, ByteToWord.A1,
+              ByteToWord.A2, ByteToWord.A3)),
+              (v4i32 (VEXTSB2W $A))>;
+    def : Pat<(v2i64 (build_vector ByteToDWord.A0, ByteToDWord.A1)),
+              (v2i64 (VEXTSB2D $A))>;
+  }
 }
Index: lib/Target/WebAssembly/WebAssemblyFastISel.cpp
===================================================================
--- lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -69,6 +69,7 @@
     bool isFIBase() const { return Kind == FrameIndexBase; }
     void setReg(unsigned Reg) {
       assert(isRegBase() && "Invalid base register access!");
+      assert(Base.Reg == 0 && "Overwriting non-zero register");
       Base.Reg = Reg;
     }
     unsigned getReg() const {
@@ -261,22 +262,24 @@
           TmpOffset += CI->getSExtValue() * S;
           break;
         }
+        if (!canFoldAddIntoGEP(U, Op)) {
+          goto unsupported_gep;
+        }
         if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
+          // assert(isa<AddOperator>(Op) && "Must be an add");
+          // assert(isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)) &&
+          //        "Must have a constant operand");
           // An unscaled add of a register. Set it as the new base.
          Addr.setReg(getRegForValue(Op));
          break;
        }
-        if (canFoldAddIntoGEP(U, Op)) {
-          // A compatible add with a constant operand. Fold the constant.
-          ConstantInt *CI =
-              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
-          TmpOffset += CI->getSExtValue() * S;
-          // Iterate on the other operand.
-          Op = cast<AddOperator>(Op)->getOperand(0);
-          continue;
-        }
-        // Unsupported
-        goto unsupported_gep;
+        // A compatible add with a constant operand. Fold the constant.
+        ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+        TmpOffset += CI->getSExtValue() * S;
+        // Iterate on the other operand.
+        Op = cast<AddOperator>(Op)->getOperand(0);
+        continue;
       }
     }
   }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -81,6 +81,12 @@
                     " of the loop header PC will be 0)."),
     cl::Hidden);
 
+static cl::opt<bool> MulConstantOptimization(
+    "mul-constant-optimization", cl::init(true),
+    cl::desc("Replace 'mul x, Const' with more effective instructions like "
+             "SHIFT, LEA, etc."),
+    cl::Hidden);
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -31031,6 +31037,77 @@
   }
 }
 
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+                                 EVT VT, SDLoc DL) {
+
+  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(Mult, DL, VT));
+    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+                         DAG.getConstant(Shift, DL, MVT::i8));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
+
+  auto combineMulMulAddOrSub = [&](bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(9, DL, VT));
+    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -81,6 +81,12 @@
                     " of the loop header PC will be 0)."),
           cl::Hidden);

+static cl::opt<bool> MulConstantOptimization(
+    "mul-constant-optimization", cl::init(true),
+    cl::desc("Replace 'mul x, Const' with more efficient instructions like "
+             "SHIFT, LEA, etc."),
+    cl::Hidden);
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -31031,6 +31037,77 @@
   }
 }

+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+                                 EVT VT, SDLoc DL) {
+
+  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(Mult, DL, VT));
+    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+                         DAG.getConstant(Shift, DL, MVT::i8));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
+
+  auto combineMulMulAddOrSub = [&](bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(9, DL, VT));
+    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+                         N->getOperand(0));
+    return Result;
+  };
+
+  switch (MulAmt) {
+  default:
+    break;
+  case 11:
+    // mul x, 11 => add ((shl (mul x, 5), 1), x)
+    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+  case 21:
+    // mul x, 21 => add ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+  case 22:
+    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+  case 19:
+    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+  case 13:
+    // mul x, 13 => add ((shl (mul x, 3), 2), x)
+    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+  case 23:
+    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
+    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+  case 14:
+    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
+  case 26:
+    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ false);
+  case 28:
+    // mul x, 28 => add ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ true);
+  case 29:
+    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulMulAddOrSub(/*isAdd*/ true));
+  case 30:
+    // mul x, 30 => sub (sub ((shl x, 5), x), x)
+    return DAG.getNode(
+        ISD::SUB, DL, VT,
+        DAG.getNode(ISD::SUB, DL, VT,
+                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                                DAG.getConstant(5, DL, MVT::i8)),
+                    N->getOperand(0)),
+        N->getOperand(0));
+  }
+  return SDValue();
+}
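Each switch case above rewrites a multiply into a short LEA/SHL/ADD chain. The case-11 entry, for example, corresponds to the following IR-level decomposition (a sketch; @mul_by_11 is an illustrative name, and on x86 the multiply-by-5 is typically emitted as a single lea):

  define i64 @mul_by_11(i64 %x) {
    ; mul x, 11 => add ((shl (mul x, 5), 1), x), i.e. (x * 5) * 2 + x = 11 * x
    %m5 = mul i64 %x, 5
    %s1 = shl i64 %m5, 1
    %r  = add i64 %s1, %x
    ret i64 %r
  }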
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -31040,6 +31117,8 @@
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);

+  if (!MulConstantOptimization)
+    return SDValue();
+
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -31095,7 +31174,8 @@
     else
       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                            DAG.getConstant(MulAmt2, DL, VT));
-  }
+  } else if (!Subtarget.slowLEA())
+    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

   if (!NewMul) {
     assert(MulAmt != 0 &&
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -5183,14 +5183,14 @@
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
-      Sched<[WriteFAdd]>;
+      Sched<[WriteFHAdd]>;
   def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
-      IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+      IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
 }

 multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
@@ -5200,14 +5200,14 @@
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
-      Sched<[WriteFAdd]>;
+      Sched<[WriteFHAdd]>;
   def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
-      IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+      IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
 }

 let Predicates = [HasAVX] in {
@@ -5310,7 +5310,7 @@
 // SSSE3 - Packed Binary Operator Instructions
 //===---------------------------------------------------------------------===//

-let Sched = WriteVecALU in {
+let Sched = WritePHAdd in {
 def SSE_PHADDSUBD : OpndItins<
   IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
 >;
Index: lib/Target/X86/X86SchedHaswell.td
===================================================================
--- lib/Target/X86/X86SchedHaswell.td
+++ lib/Target/X86/X86SchedHaswell.td
@@ -1488,6 +1488,39 @@

 //-- Arithmetic instructions --//

+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2, 1];
+}
+
 // PHADD|PHSUB (S) W/D.
 // v <- v,v.
 def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
Index: lib/Target/X86/X86SchedSandyBridge.td
===================================================================
--- lib/Target/X86/X86SchedSandyBridge.td
+++ lib/Target/X86/X86SchedSandyBridge.td
@@ -157,6 +157,31 @@
   let ResourceCycles = [1, 1, 1, 1];
 }

+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [SBPort1]> {
+  let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [SBPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 1];
+}
+
 // String instructions.
 // Packed Compare Implicit Length Strings, Return Mask
 def : WriteRes {
Index: lib/Target/X86/X86Schedule.td
===================================================================
--- lib/Target/X86/X86Schedule.td
+++ lib/Target/X86/X86Schedule.td
@@ -77,6 +77,10 @@
 // FMA Scheduling helper class.
 class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }

+// Horizontal Add/Sub (float and integer)
+defm WriteFHAdd : X86SchedWritePair;
+defm WritePHAdd : X86SchedWritePair;
+
 // Vector integer operations.
 defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
 defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
Index: lib/Target/X86/X86ScheduleBtVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBtVer2.td
+++ lib/Target/X86/X86ScheduleBtVer2.td
@@ -319,6 +319,38 @@
   let ResourceCycles = [1, 1];
 }

+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFHAdd, [JFPU0]> {
+  let Latency = 3;
+}
+
+def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> {
+  let Latency = 8;
+}
+
+def : WriteRes<WritePHAdd, [JFPU01]> {
+  let ResourceCycles = [1];
+}
+def : WriteRes<WritePHAddLd, [JLAGU, JFPU01]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteFHAddY: SchedWriteRes<[JFPU0]> {
+  let Latency = 3;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
+
+def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Carry-less multiplication instructions.
 ////////////////////////////////////////////////////////////////////////////////
Index: lib/Target/X86/X86ScheduleSLM.td
===================================================================
--- lib/Target/X86/X86ScheduleSLM.td
+++ lib/Target/X86/X86ScheduleSLM.td
@@ -137,6 +137,33 @@
 defm : SMWriteResPair;
 defm : SMWriteResPair;

+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// HADD, HSUB PS/PD
+
+def : WriteRes {
+  let Latency = 3;
+  let ResourceCycles = [2];
+}
+
+def : WriteRes {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+def : WriteRes {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+
+def : WriteRes {
+  let Latency = 4;
+  let ResourceCycles = [1, 1];
+}
+
 // String instructions.
 // Packed Compare Implicit Length Strings, Return Mask
 def : WriteRes {
Index: lib/Transforms/IPO/SampleProfile.cpp
===================================================================
--- lib/Transforms/IPO/SampleProfile.cpp
+++ lib/Transforms/IPO/SampleProfile.cpp
@@ -695,6 +695,13 @@
         CallSite(I).isIndirectCall())
       for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
         auto CalleeFunctionName = FS->getName();
+        // If it is a recursive call, we do not inline it, as that could bloat
+        // the code exponentially. There is a better way to handle this, e.g.
+        // clone the caller first, and inline the cloned caller if it is
+        // recursive. As LLVM does not inline recursive calls, we will simply
+        // ignore it instead of handling it explicitly.
+        if (CalleeFunctionName == F.getName())
+          continue;
         const char *Reason = "Callee function not available";
         auto R = SymbolMap.find(CalleeFunctionName);
         if (R == SymbolMap.end())
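To make the newly skipped case concrete, here is a sketch of a profiled indirect call whose hottest target is the caller itself (illustrative names, not from the patch); the loop above now ignores such a target rather than treating it as a promotion and inlining candidate:

  define void @work(void ()* %fp) {
  entry:
    ; Suppose the indirect-call samples name @work itself as the hot target.
    ; Promoting and inlining it would recurse, so it is skipped.
    call void %fp()
    ret void
  }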
Index: lib/Transforms/Scalar/GVNSink.cpp
===================================================================
--- lib/Transforms/Scalar/GVNSink.cpp
+++ lib/Transforms/Scalar/GVNSink.cpp
@@ -169,8 +169,8 @@
             NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
            - SplitEdgeCost;
   }
-  bool operator>=(const SinkingInstructionCandidate &Other) const {
-    return Cost >= Other.Cost;
+  bool operator>(const SinkingInstructionCandidate &Other) const {
+    return Cost > Other.Cost;
   }
 };
@@ -745,7 +745,7 @@
   std::stable_sort(
       Candidates.begin(), Candidates.end(),
       [](const SinkingInstructionCandidate &A,
-         const SinkingInstructionCandidate &B) { return A >= B; });
+         const SinkingInstructionCandidate &B) { return A > B; });
   DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C : Candidates)
                                                    dbgs() << "  " << C << "\n";);
Index: lib/Transforms/Scalar/InferAddressSpaces.cpp
===================================================================
--- lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -500,6 +500,7 @@
   }

   // Computes the operands of the new constant expression.
+  bool IsNew = false;
   SmallVector<Constant *, 4> NewOperands;
   for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
     Constant *Operand = CE->getOperand(Index);
@@ -509,6 +510,7 @@
     // bitcast, and getelementptr) do not incur cycles in the data flow graph
     // and (2) this function is called on constant expressions in postorder.
     if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+      IsNew = true;
       NewOperands.push_back(cast<Constant>(NewOperand));
     } else {
       // Otherwise, reuses the old operand.
@@ -516,6 +518,11 @@
     }
   }

+  // If !IsNew, we would replace the Value with itself. However, replaced
+  // values are assumed to be wrapped in an addrspacecast later, so drop it
+  // now.
+  if (!IsNew)
+    return nullptr;
+
   if (CE->getOpcode() == Instruction::GetElementPtr) {
     // Needs to specify the source type while constructing a getelementptr
     // constant expression.
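As an illustration of the early return added above (a sketch with illustrative names), consider a constant GEP expression none of whose operands was mapped to a new address space; cloning it would reproduce the identical expression, so the cloning helper now returns nullptr instead of replacing the value with itself:

  @g = addrspace(3) global [4 x float] zeroinitializer

  define float @f() {
    ; No operand of the constant GEP below has been given a new address
    ; space, so its clone would be the very same constant expression.
    %v = load float, float addrspace(3)* getelementptr inbounds ([4 x float], [4 x float] addrspace(3)* @g, i64 0, i64 2)
    ret float %v
  }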
Index: lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- lib/Transforms/Scalar/JumpThreading.cpp
+++ lib/Transforms/Scalar/JumpThreading.cpp
@@ -132,7 +132,7 @@
   bool HasProfileData = F.getEntryCount().hasValue();
   if (HasProfileData) {
     LoopInfo LI{DominatorTree(F)};
-    BPI.reset(new BranchProbabilityInfo(F, LI));
+    BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }
@@ -152,7 +152,7 @@
   bool HasProfileData = F.getEntryCount().hasValue();
   if (HasProfileData) {
     LoopInfo LI{DominatorTree(F)};
-    BPI.reset(new BranchProbabilityInfo(F, LI));
+    BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }
Index: test/Analysis/BranchProbabilityInfo/libfunc_call.ll
===================================================================
--- /dev/null
+++ test/Analysis/BranchProbabilityInfo/libfunc_call.ll
@@ -0,0 +1,264 @@
+; RUN: opt < %s -analyze -branch-prob | FileCheck %s
+; RUN: opt < %s -analyze -lazy-branch-prob | FileCheck %s
+; RUN: opt < %s -passes='print<branch-prob>' -disable-output 2>&1 | FileCheck %s
+
+declare i32 @strcmp(i8*, i8*)
+declare i32 @strncmp(i8*, i8*, i32)
+declare i32 @strcasecmp(i8*, i8*)
+declare i32 @strncasecmp(i8*, i8*, i32)
+declare i32 @memcmp(i8*, i8*)
+declare i32 @nonstrcmp(i8*, i8*)
+
+
+; Check that the result of strcmp is considered more likely to be nonzero than
+; zero, and equally likely to be (nonzero) positive or negative.
+
+define i32 @test_strcmp_eq(i8* %p, i8* %q) {
+; CHECK: Printing analysis {{.*}} for function 'test_strcmp_eq'
+entry:
+  %val = call i32 @strcmp(i8* %p, i8* %q)
+  %cond = icmp eq i32 %val, 0
+  br i1 %cond, label %then, label %else
+; CHECK: edge entry -> then probability is 0x30000000 / 0x80000000 = 37.50%
+; CHECK: edge entry -> else probability is 0x50000000 / 0x80000000 = 62.50%
+
+then:
+  br label %exit
+; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+else:
+  br label %exit
+; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+exit:
+  %result = phi i32 [ 0, %then ], [ 1, %else ]
+  ret i32 %result
+}
+
+define i32 @test_strcmp_ne(i8* %p, i8* %q) {
+; CHECK: Printing analysis {{.*}} for function 'test_strcmp_ne'
+entry:
+  %val = call i32 @strcmp(i8* %p, i8* %q)
+  %cond = icmp ne i32 %val, 0
+  br i1 %cond, label %then, label %else
+; CHECK: edge entry -> then probability is 0x50000000 / 0x80000000 = 62.50%
+; CHECK: edge entry -> else probability is 0x30000000 / 0x80000000 = 37.50%
+
+then:
+  br label %exit
+; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+else:
+  br label %exit
+; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+exit:
+  %result = phi i32 [ 0, %then ], [ 1, %else ]
+  ret i32 %result
+}
+
+define i32 @test_strcmp_sgt(i8* %p, i8* %q) {
+; CHECK: Printing analysis {{.*}} for function 'test_strcmp_sgt'
+entry:
+  %val = call i32 @strcmp(i8* %p, i8* %q)
+  %cond = icmp sgt i32 %val, 0
+  br i1 %cond, label %then, label %else
+; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00%
+; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00%
+
+then:
+  br label %exit
+; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+else:
+  br label %exit
+; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+exit:
+  %result = phi i32 [ 0, %then ],
[ 1, %else ] + ret i32 %result +} + +define i32 @test_strcmp_slt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_strcmp_slt' +entry: + %val = call i32 @strcmp(i8* %p, i8* %q) + %cond = icmp slt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + + +; Similarly check other library functions that have the same behaviour + +define i32 @test_strncmp_sgt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_strncmp_sgt' +entry: + %val = call i32 @strncmp(i8* %p, i8* %q, i32 4) + %cond = icmp sgt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + +define i32 @test_strcasecmp_sgt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_strcasecmp_sgt' +entry: + %val = call i32 @strcasecmp(i8* %p, i8* %q) + %cond = icmp sgt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + +define i32 @test_strncasecmp_sgt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_strncasecmp_sgt' +entry: + %val = call i32 @strncasecmp(i8* %p, i8* %q, i32 4) + %cond = icmp sgt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + +define i32 @test_memcmp_sgt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_memcmp_sgt' +entry: + %val = call i32 @memcmp(i8* %p, i8* %q) + %cond = icmp sgt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge entry -> else probability is 0x40000000 / 0x80000000 = 50.00% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + 
+exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + + +; Check that for the result of a call to a non-library function the default +; heuristic is applied, i.e. positive more likely than negative, nonzero more +; likely than zero. + +define i32 @test_nonstrcmp_eq(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_nonstrcmp_eq' +entry: + %val = call i32 @nonstrcmp(i8* %p, i8* %q) + %cond = icmp eq i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge entry -> else probability is 0x50000000 / 0x80000000 = 62.50% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + +define i32 @test_nonstrcmp_ne(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_nonstrcmp_ne' +entry: + %val = call i32 @nonstrcmp(i8* %p, i8* %q) + %cond = icmp ne i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x50000000 / 0x80000000 = 62.50% +; CHECK: edge entry -> else probability is 0x30000000 / 0x80000000 = 37.50% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} + +define i32 @test_nonstrcmp_sgt(i8* %p, i8* %q) { +; CHECK: Printing analysis {{.*}} for function 'test_nonstrcmp_sgt' +entry: + %val = call i32 @nonstrcmp(i8* %p, i8* %q) + %cond = icmp sgt i32 %val, 0 + br i1 %cond, label %then, label %else +; CHECK: edge entry -> then probability is 0x50000000 / 0x80000000 = 62.50% +; CHECK: edge entry -> else probability is 0x30000000 / 0x80000000 = 37.50% + +then: + br label %exit +; CHECK: edge then -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +else: + br label %exit +; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] + +exit: + %result = phi i32 [ 0, %then ], [ 1, %else ] + ret i32 %result +} Index: test/CodeGen/AMDGPU/branch-relax-spill.ll =================================================================== --- test/CodeGen/AMDGPU/branch-relax-spill.ll +++ test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -7,110 +7,110 @@ define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { entry: - %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0 - %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0 - %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={SGPR2}"() #0 - %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={SGPR3}"() #0 - %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={SGPR4}"() #0 - %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={SGPR5}"() #0 - %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={SGPR6}"() #0 - %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={SGPR7}"() #0 - %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={SGPR8}"() #0 - %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={SGPR9}"() #0 - %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={SGPR10}"() #0 - %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 
0", "={SGPR11}"() #0 - %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={SGPR12}"() #0 - %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={SGPR13}"() #0 - %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={SGPR14}"() #0 - %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={SGPR15}"() #0 - %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={SGPR16}"() #0 - %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={SGPR17}"() #0 - %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={SGPR18}"() #0 - %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={SGPR19}"() #0 - %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={SGPR20}"() #0 - %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={SGPR21}"() #0 - %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={SGPR22}"() #0 - %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={SGPR23}"() #0 - %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={SGPR24}"() #0 - %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={SGPR25}"() #0 - %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={SGPR26}"() #0 - %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={SGPR27}"() #0 - %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={SGPR28}"() #0 - %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={SGPR29}"() #0 - %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={SGPR30}"() #0 - %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={SGPR31}"() #0 - %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={SGPR32}"() #0 - %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={SGPR33}"() #0 - %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={SGPR34}"() #0 - %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={SGPR35}"() #0 - %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={SGPR36}"() #0 - %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={SGPR37}"() #0 - %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={SGPR38}"() #0 - %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={SGPR39}"() #0 - %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={SGPR40}"() #0 - %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={SGPR41}"() #0 - %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={SGPR42}"() #0 - %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={SGPR43}"() #0 - %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={SGPR44}"() #0 - %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={SGPR45}"() #0 - %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={SGPR46}"() #0 - %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={SGPR47}"() #0 - %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={SGPR48}"() #0 - %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={SGPR49}"() #0 - %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={SGPR50}"() #0 - %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={SGPR51}"() #0 - %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={SGPR52}"() #0 - %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={SGPR53}"() #0 - %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={SGPR54}"() #0 - %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={SGPR55}"() #0 - %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={SGPR56}"() #0 - %sgpr57 = tail call 
i32 asm sideeffect "s_mov_b32 s57, 0", "={SGPR57}"() #0 - %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={SGPR58}"() #0 - %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={SGPR59}"() #0 - %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={SGPR60}"() #0 - %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={SGPR61}"() #0 - %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={SGPR62}"() #0 - %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={SGPR63}"() #0 - %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={SGPR64}"() #0 - %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={SGPR65}"() #0 - %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={SGPR66}"() #0 - %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={SGPR67}"() #0 - %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={SGPR68}"() #0 - %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={SGPR69}"() #0 - %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={SGPR70}"() #0 - %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={SGPR71}"() #0 - %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={SGPR72}"() #0 - %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={SGPR73}"() #0 - %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={SGPR74}"() #0 - %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={SGPR75}"() #0 - %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={SGPR76}"() #0 - %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={SGPR77}"() #0 - %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={SGPR78}"() #0 - %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={SGPR79}"() #0 - %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={SGPR80}"() #0 - %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={SGPR81}"() #0 - %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={SGPR82}"() #0 - %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={SGPR83}"() #0 - %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={SGPR84}"() #0 - %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={SGPR85}"() #0 - %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={SGPR86}"() #0 - %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={SGPR87}"() #0 - %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={SGPR88}"() #0 - %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={SGPR89}"() #0 - %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={SGPR90}"() #0 - %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={SGPR91}"() #0 - %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={SGPR92}"() #0 - %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={SGPR93}"() #0 - %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={SGPR94}"() #0 - %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={SGPR95}"() #0 - %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={SGPR96}"() #0 - %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={SGPR97}"() #0 - %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={SGPR98}"() #0 - %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={SGPR99}"() #0 - %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={SGPR100}"() #0 - %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={SGPR101}"() #0 - %sgpr102 = tail call i32 asm sideeffect "s_mov_b32 s102, 
0", "={SGPR102}"() #0 - %sgpr103 = tail call i32 asm sideeffect "s_mov_b32 s103, 0", "={SGPR103}"() #0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm 
sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 
0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %sgpr102 = tail call i32 asm sideeffect "s_mov_b32 s102, 0", "={s102}"() #0 + %sgpr103 = tail call i32 asm sideeffect "s_mov_b32 s103, 0", "={s103}"() #0 %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_LO}"() #0 %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={VCC_HI}"() #0 %cmp = icmp eq i32 %cnd, 0 @@ -126,112 +126,112 @@ br label %bb3 bb3: - tail call void asm sideeffect "; reg use $0", "{SGPR0}"(i32 %sgpr0) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR1}"(i32 %sgpr1) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR2}"(i32 %sgpr2) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR3}"(i32 %sgpr3) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR4}"(i32 %sgpr4) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR5}"(i32 %sgpr5) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR6}"(i32 %sgpr6) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR7}"(i32 %sgpr7) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR8}"(i32 %sgpr8) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR9}"(i32 %sgpr9) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR10}"(i32 %sgpr10) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR11}"(i32 %sgpr11) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR12}"(i32 %sgpr12) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR13}"(i32 %sgpr13) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR14}"(i32 %sgpr14) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR15}"(i32 %sgpr15) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR16}"(i32 %sgpr16) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR17}"(i32 %sgpr17) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR18}"(i32 %sgpr18) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR19}"(i32 %sgpr19) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR20}"(i32 %sgpr20) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR21}"(i32 %sgpr21) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR22}"(i32 %sgpr22) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR23}"(i32 %sgpr23) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR24}"(i32 %sgpr24) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR25}"(i32 %sgpr25) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR26}"(i32 %sgpr26) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR27}"(i32 %sgpr27) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR28}"(i32 %sgpr28) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR29}"(i32 %sgpr29) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR30}"(i32 %sgpr30) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR31}"(i32 %sgpr31) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR32}"(i32 %sgpr32) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR33}"(i32 %sgpr33) #0 - tail call void asm 
sideeffect "; reg use $0", "{SGPR34}"(i32 %sgpr34) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR35}"(i32 %sgpr35) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR36}"(i32 %sgpr36) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR37}"(i32 %sgpr37) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR38}"(i32 %sgpr38) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR39}"(i32 %sgpr39) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR40}"(i32 %sgpr40) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR41}"(i32 %sgpr41) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR42}"(i32 %sgpr42) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR43}"(i32 %sgpr43) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR44}"(i32 %sgpr44) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR45}"(i32 %sgpr45) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR46}"(i32 %sgpr46) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR47}"(i32 %sgpr47) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR48}"(i32 %sgpr48) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR49}"(i32 %sgpr49) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR50}"(i32 %sgpr50) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR51}"(i32 %sgpr51) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR52}"(i32 %sgpr52) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR53}"(i32 %sgpr53) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR54}"(i32 %sgpr54) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR55}"(i32 %sgpr55) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR56}"(i32 %sgpr56) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR57}"(i32 %sgpr57) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR58}"(i32 %sgpr58) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR59}"(i32 %sgpr59) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR60}"(i32 %sgpr60) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR61}"(i32 %sgpr61) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR62}"(i32 %sgpr62) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR63}"(i32 %sgpr63) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR64}"(i32 %sgpr64) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR65}"(i32 %sgpr65) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR66}"(i32 %sgpr66) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR67}"(i32 %sgpr67) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR68}"(i32 %sgpr68) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR69}"(i32 %sgpr69) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR70}"(i32 %sgpr70) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR71}"(i32 %sgpr71) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR72}"(i32 %sgpr72) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR73}"(i32 %sgpr73) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR74}"(i32 %sgpr74) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR75}"(i32 %sgpr75) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR76}"(i32 %sgpr76) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR77}"(i32 %sgpr77) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR78}"(i32 %sgpr78) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR79}"(i32 %sgpr79) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR80}"(i32 %sgpr80) #0 - tail call void asm sideeffect "; reg use $0", 
"{SGPR81}"(i32 %sgpr81) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR82}"(i32 %sgpr82) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR83}"(i32 %sgpr83) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR84}"(i32 %sgpr84) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR85}"(i32 %sgpr85) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR86}"(i32 %sgpr86) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR87}"(i32 %sgpr87) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR88}"(i32 %sgpr88) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR89}"(i32 %sgpr89) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR90}"(i32 %sgpr90) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR91}"(i32 %sgpr91) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR92}"(i32 %sgpr92) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR93}"(i32 %sgpr93) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR94}"(i32 %sgpr94) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR95}"(i32 %sgpr95) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR96}"(i32 %sgpr96) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR97}"(i32 %sgpr97) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR98}"(i32 %sgpr98) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR99}"(i32 %sgpr99) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR100}"(i32 %sgpr100) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR101}"(i32 %sgpr101) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR102}"(i32 %sgpr102) #0 - tail call void asm sideeffect "; reg use $0", "{SGPR103}"(i32 %sgpr103) #0 - tail call void asm sideeffect "; reg use $0", "{VCC_LO}"(i32 %vcc_lo) #0 - tail call void asm sideeffect "; reg use $0", "{VCC_HI}"(i32 %vcc_hi) #0 + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call 
void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg 
use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{s102}"(i32 %sgpr102) #0 + tail call void asm sideeffect "; reg use $0", "{s103}"(i32 %sgpr103) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 ret void } Index: test/CodeGen/AMDGPU/exceed-max-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -2,97 +2,97 @@ ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 { - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", 
"~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () - call void asm sideeffect "", "~{SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103}" () - call void asm sideeffect "", "~{VCC}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{s[96:103]}" () + call void asm sideeffect "", "~{vcc}" () ret void } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () - call void asm sideeffect "", "~{SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103}" () - call void asm sideeffect "", "~{VCC}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{s[96:103]}" () + call void asm sideeffect "", "~{vcc}" () ret void } ; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", 
"~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () - call void asm sideeffect "", "~{SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103}" () - call void asm sideeffect "", "~{VCC}" () - call void asm sideeffect "", "~{FLAT_SCR}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{s[96:103]}" () + call void asm sideeffect "", "~{vcc}" () + call void asm sideeffect "", "~{flat_scratch}" () ret void } ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 { - call void asm sideeffect "", "~{VCC}" () - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () + call void asm sideeffect "", "~{vcc}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm 
sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () ret void } ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 { - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () - call void asm sideeffect "", "~{SGPR96_SGPR97_SGPR98_SGPR99}" () - call void asm sideeffect "", "~{SGPR100_SGPR101}" () - call void asm sideeffect "", "~{SGPR102}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{s[96:99]}" () + call void asm sideeffect "", "~{s[100:101]}" () + call void asm sideeffect "", "~{s102}" () ret void } Index: test/CodeGen/AMDGPU/flat-scratch-reg.ll =================================================================== --- test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -21,7 +21,7 @@ ; VI-XNACK: ; NumSgprs: 12 define amdgpu_kernel void @no_vcc_no_flat() { entry: - call void asm sideeffect "", "~{SGPR7}"() + call void asm sideeffect "", "~{s7}"() ret void } @@ -35,7 +35,7 @@ ; VI-XNACK: ; NumSgprs: 12 define amdgpu_kernel void @vcc_no_flat() { entry: - call void asm sideeffect "", "~{SGPR7},~{VCC}"() + call void asm sideeffect "", "~{s7},~{vcc}"() ret void } @@ -52,7 +52,7 @@ ; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @no_vcc_flat() { entry: - call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() + call void asm sideeffect "", "~{s7},~{flat_scratch}"() ret void } @@ -68,7 +68,7 @@ ; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @vcc_flat() { entry: - call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() + call void asm sideeffect "", 
"~{s7},~{vcc},~{flat_scratch}"() ret void } @@ -81,7 +81,7 @@ ; VI-XNACK: NumSgprs: 6 define amdgpu_kernel void @use_flat_scr() #0 { entry: - call void asm sideeffect "; clobber ", "~{FLAT_SCR}"() + call void asm sideeffect "; clobber ", "~{flat_scratch}"() ret void } @@ -91,7 +91,7 @@ ; VI-XNACK: NumSgprs: 6 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: - call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"() + call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() ret void } @@ -101,7 +101,7 @@ ; VI-XNACK: NumSgprs: 6 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: - call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"() + call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() ret void } Index: test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -5,40 +5,40 @@ ; GCN: ; illegal copy v1 to s9 define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 { - %vgpr = call i32 asm sideeffect "; def $0", "=${VGPR1}"() - call void asm sideeffect "; use $0", "${SGPR9}"(i32 %vgpr) + %vgpr = call i32 asm sideeffect "; def $0", "=${v1}"() + call void asm sideeffect "; use $0", "${s9}"(i32 %vgpr) ret void } ; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v[0:1] to s[10:11] define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v2i32() #0 { - %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1}"() - call void asm sideeffect "; use $0", "${SGPR10_SGPR11}"(<2 x i32> %vgpr) + %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${v[0:1]}"() + call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) ret void } ; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v4i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v[0:3] to s[8:11] define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v4i32() #0 { - %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3}"() - call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11}"(<4 x i32> %vgpr) + %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${v[0:3]}"() + call void asm sideeffect "; use $0", "${s[8:11]}"(<4 x i32> %vgpr) ret void } ; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_v8i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v[0:7] to s[8:15] define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v8i32() #0 { - %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}"() - call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}"(<8 x i32> %vgpr) + %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${v[0:7]}"() + call void asm sideeffect "; use $0", "${s[8:15]}"(<8 x i32> %vgpr) ret void } ; ERR error: :0:0: in function illegal_vgpr_to_sgpr_copy_v16i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v[0:15] to s[16:31] define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { - %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}"() - call void asm sideeffect "; use $0", "${SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23_SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}"(<16 x i32> %vgpr) + %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${v[0:15]}"() + call void 
asm sideeffect "; use $0", "${s[16:31]}"(<16 x i32> %vgpr) ret void } Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -379,7 +379,7 @@ %idx0 = load volatile i32, i32 addrspace(1)* %gep %idx1 = add i32 %idx0, 1 %val0 = extractelement <4 x i32> , i32 %idx0 - %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={SGPR4}" () + %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" () %val1 = extractelement <4 x i32> , i32 %idx1 store volatile i32 %val0, i32 addrspace(1)* %out0 store volatile i32 %val1, i32 addrspace(1)* %out0 Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -193,7 +193,7 @@ ; CHECK: use v[0:1] define amdgpu_kernel void @i64_imm_input_phys_vgpr() { entry: - call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456) + call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456) ret void } @@ -202,7 +202,7 @@ ; CHECK: ; use v0 define amdgpu_kernel void @i1_imm_input_phys_vgpr() { entry: - call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 true) + call void asm sideeffect "; use $0 ", "{v0}"(i1 true) ret void } @@ -215,7 +215,7 @@ define amdgpu_kernel void @i1_input_phys_vgpr() { entry: %val = load i1, i1 addrspace(1)* undef - call void asm sideeffect "; use $0 ", "{VGPR0}"(i1 %val) + call void asm sideeffect "; use $0 ", "{v0}"(i1 %val) ret void } @@ -229,7 +229,7 @@ entry: %val0 = load volatile i1, i1 addrspace(1)* undef %val1 = load volatile i1, i1 addrspace(1)* undef - call void asm sideeffect "; use $0 $1 ", "{VGPR0}, {VGPR1}"(i1 %val0, i1 %val1) + call void asm sideeffect "; use $0 $1 ", "{v0}, {v1}"(i1 %val0, i1 %val1) ret void } @@ -240,8 +240,8 @@ ; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1 define amdgpu_kernel void @muliple_def_phys_vgpr() { entry: - %def0 = call i32 asm sideeffect "; def $0 ", "={VGPR0}"() - %def1 = call i32 asm sideeffect "; def $0 ", "={VGPR0}"() + %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"() + %def1 = call i32 asm sideeffect "; def $0 ", "={v0}"() %add = shl i32 %def0, %def1 store i32 %add, i32 addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -4,18 +4,28 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: -; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_mov_b32_e32 v5, v1 +; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { - %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 - store i64 %result, i64 addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[4:5]},v"(i64 %src) #0 + %tmp1 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0 + %tmp2 = call i64 asm ";; force constraint", "=v,{v[4:5]}"(i64 %tmp1) #0 + store i64 %tmp2, i64 addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate: -; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], 
s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7] +; GCN-DAG: v_mov_b32_e32 v3, v1 +; GCN-DAG: v_mov_b32_e32 v2, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { - %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 - store i64 %result, i64 addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={v[6:7]},v"(i64 %b) #0 + %tmp3 = call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0 + %tmp4 = call i64 asm ";; force constraint", "=v,{v[2:3]}"(i64 %tmp3) #0 + store i64 %tmp4, i64 addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -3,45 +3,56 @@ declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0 -; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: -; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { - %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> ) #0 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 +; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: +; GCN-DAG: v_mov_b32_e32 v0, v2 +; GCN-DAG: v_mov_b32_e32 v1, v3 +; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}] +define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> ) #0 + %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: -; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_mov_b32_e32 v0, v2 +; GCN-DAG: v_mov_b32_e32 v1, v3 +; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}] define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { - %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: -; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { - %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %b) #0 + %tmp3 = call <4 x i32> asm ";; force 
constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: -; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_mov_b32_e32 v0, v2 +; GCN-DAG: v_mov_b32_e32 v1, v3 +; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}] define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { - %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> ) #0 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> ) #0 + %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: -; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_mov_b32_e32 v0, v2 +; GCN-DAG: v_mov_b32_e32 v1, v3 +; GCN: v_mqsad_u32_u8 v[2:5], v[0:1], v6, v[{{[0-9]+:[0-9]+}}] define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { %in = load <4 x i32>, <4 x i32> addrspace(1) * %input - - %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %tmp, i32 %tmp1, <4 x i32> %in) #0 + %tmp3 = call <4 x i32> asm ";; force constraint", "=v,{v[2:5]}"(<4 x i32> %tmp2) #0 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -4,18 +4,28 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: -; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_mov_b32_e32 v5, v1 +; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { - %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 - store i64 %result, i64 addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[4:5]},v"(i64 %src) #0 + %tmp1 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 100, i64 100) #0 + %tmp2 = call i64 asm ";; force constraint", "=v,{v[4:5]}"(i64 %tmp1) #0 + store i64 %tmp2, i64 addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate: -; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[2:3], v4, v[6:7] +; GCN-DAG: v_mov_b32_e32 v3, v1 +; GCN-DAG: v_mov_b32_e32 v2, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { - 
%result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 - store i64 %result, i64 addrspace(1)* %out, align 4 + %tmp = call i64 asm "v_lsrlrev_b64 $0, $1, 1", "={v[2:3]},v"(i64 %src) #0 + %tmp1 = call i32 asm "v_mov_b32 $0, $1", "={v4},v"(i32 %a) #0 + %tmp2 = call i64 asm "v_lshlrev_b64 $0, $1, 1", "={v[6:7]},v"(i64 %b) #0 + %tmp3 = call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %tmp, i32 %tmp1, i64 %tmp2) #0 + %tmp4 = call i64 asm ";; force constraint", "=v,{v[2:3]}"(i64 %tmp3) #0 + store i64 %tmp4, i64 addrspace(1)* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll =================================================================== --- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -608,11 +608,11 @@ ; GCN: ;;#ASMSTART ; GCN: ; use s[0:1] define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { - call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () #0 - call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () #0 - call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19}"() #0 - call void asm sideeffect "", "~{VGPR20_VGPR21}"() #0 - call void asm sideeffect "", "~{VGPR22}"() #0 + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:19]}"() #0 + call void asm sideeffect "", "~{v[20:21]}"() #0 + call void asm sideeffect "", "~{v22}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 Index: test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -25,50 +25,50 @@ ; SMEM: s_dcache_wb ; ALL: s_endpgm define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { - call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () - call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () - call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () - call void asm sideeffect "", "~{SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}" () - call void asm sideeffect "", "~{SGPR32_SGPR33_SGPR34_SGPR35_SGPR36_SGPR37_SGPR38_SGPR39}" () - call void asm sideeffect "", "~{SGPR40_SGPR41_SGPR42_SGPR43_SGPR44_SGPR45_SGPR46_SGPR47}" () - call void asm sideeffect "", "~{SGPR48_SGPR49_SGPR50_SGPR51_SGPR52_SGPR53_SGPR54_SGPR55}" () - call void asm sideeffect "", "~{SGPR56_SGPR57_SGPR58_SGPR59_SGPR60_SGPR61_SGPR62_SGPR63}" () - call void asm sideeffect "", "~{SGPR64_SGPR65_SGPR66_SGPR67_SGPR68_SGPR69_SGPR70_SGPR71}" () - call void asm sideeffect "", "~{SGPR72_SGPR73_SGPR74_SGPR75_SGPR76_SGPR77_SGPR78_SGPR79}" () - call void asm sideeffect "", "~{SGPR80_SGPR81_SGPR82_SGPR83_SGPR84_SGPR85_SGPR86_SGPR87}" () - call void asm sideeffect "", "~{SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95}" () - call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () - call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () - call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19_VGPR20_VGPR21_VGPR22_VGPR23}" () - call void asm sideeffect "", "~{VGPR24_VGPR25_VGPR26_VGPR27_VGPR28_VGPR29_VGPR30_VGPR31}" () - call void asm sideeffect "", 
"~{VGPR32_VGPR33_VGPR34_VGPR35_VGPR36_VGPR37_VGPR38_VGPR39}" () - call void asm sideeffect "", "~{VGPR40_VGPR41_VGPR42_VGPR43_VGPR44_VGPR45_VGPR46_VGPR47}" () - call void asm sideeffect "", "~{VGPR48_VGPR49_VGPR50_VGPR51_VGPR52_VGPR53_VGPR54_VGPR55}" () - call void asm sideeffect "", "~{VGPR56_VGPR57_VGPR58_VGPR59_VGPR60_VGPR61_VGPR62_VGPR63}" () - call void asm sideeffect "", "~{VGPR64_VGPR65_VGPR66_VGPR67_VGPR68_VGPR69_VGPR70_VGPR71}" () - call void asm sideeffect "", "~{VGPR72_VGPR73_VGPR74_VGPR75_VGPR76_VGPR77_VGPR78_VGPR79}" () - call void asm sideeffect "", "~{VGPR80_VGPR81_VGPR82_VGPR83_VGPR84_VGPR85_VGPR86_VGPR87}" () - call void asm sideeffect "", "~{VGPR88_VGPR89_VGPR90_VGPR91_VGPR92_VGPR93_VGPR94_VGPR95}" () - call void asm sideeffect "", "~{VGPR96_VGPR97_VGPR98_VGPR99_VGPR100_VGPR101_VGPR102_VGPR103}" () - call void asm sideeffect "", "~{VGPR104_VGPR105_VGPR106_VGPR107_VGPR108_VGPR109_VGPR110_VGPR111}" () - call void asm sideeffect "", "~{VGPR112_VGPR113_VGPR114_VGPR115_VGPR116_VGPR117_VGPR118_VGPR119}" () - call void asm sideeffect "", "~{VGPR120_VGPR121_VGPR122_VGPR123_VGPR124_VGPR125_VGPR126_VGPR127}" () - call void asm sideeffect "", "~{VGPR128_VGPR129_VGPR130_VGPR131_VGPR132_VGPR133_VGPR134_VGPR135}" () - call void asm sideeffect "", "~{VGPR136_VGPR137_VGPR138_VGPR139_VGPR140_VGPR141_VGPR142_VGPR143}" () - call void asm sideeffect "", "~{VGPR144_VGPR145_VGPR146_VGPR147_VGPR148_VGPR149_VGPR150_VGPR151}" () - call void asm sideeffect "", "~{VGPR152_VGPR153_VGPR154_VGPR155_VGPR156_VGPR157_VGPR158_VGPR159}" () - call void asm sideeffect "", "~{VGPR160_VGPR161_VGPR162_VGPR163_VGPR164_VGPR165_VGPR166_VGPR167}" () - call void asm sideeffect "", "~{VGPR168_VGPR169_VGPR170_VGPR171_VGPR172_VGPR173_VGPR174_VGPR175}" () - call void asm sideeffect "", "~{VGPR176_VGPR177_VGPR178_VGPR179_VGPR180_VGPR181_VGPR182_VGPR183}" () - call void asm sideeffect "", "~{VGPR184_VGPR185_VGPR186_VGPR187_VGPR188_VGPR189_VGPR190_VGPR191}" () - call void asm sideeffect "", "~{VGPR192_VGPR193_VGPR194_VGPR195_VGPR196_VGPR197_VGPR198_VGPR199}" () - call void asm sideeffect "", "~{VGPR200_VGPR201_VGPR202_VGPR203_VGPR204_VGPR205_VGPR206_VGPR207}" () - call void asm sideeffect "", "~{VGPR208_VGPR209_VGPR210_VGPR211_VGPR212_VGPR213_VGPR214_VGPR215}" () - call void asm sideeffect "", "~{VGPR216_VGPR217_VGPR218_VGPR219_VGPR220_VGPR221_VGPR222_VGPR223}" () - call void asm sideeffect "", "~{VGPR224_VGPR225_VGPR226_VGPR227_VGPR228_VGPR229_VGPR230_VGPR231}" () - call void asm sideeffect "", "~{VGPR232_VGPR233_VGPR234_VGPR235_VGPR236_VGPR237_VGPR238_VGPR239}" () - call void asm sideeffect "", "~{VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247}" () - call void asm sideeffect "", "~{VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255}" () + call void asm sideeffect "", "~{s[0:7]}" () + call void asm sideeffect "", "~{s[8:15]}" () + call void asm sideeffect "", "~{s[16:23]}" () + call void asm sideeffect "", "~{s[24:31]}" () + call void asm sideeffect "", "~{s[32:39]}" () + call void asm sideeffect "", "~{s[40:47]}" () + call void asm sideeffect "", "~{s[48:55]}" () + call void asm sideeffect "", "~{s[56:63]}" () + call void asm sideeffect "", "~{s[64:71]}" () + call void asm sideeffect "", "~{s[72:79]}" () + call void asm sideeffect "", "~{s[80:87]}" () + call void asm sideeffect "", "~{s[88:95]}" () + call void asm sideeffect "", "~{v[0:7]}" () + call void asm sideeffect "", "~{v[8:15]}" () + call void asm sideeffect "", "~{v[16:23]}" () + call void asm sideeffect "", "~{v[24:31]}" () + call 
void asm sideeffect "", "~{v[32:39]}" () + call void asm sideeffect "", "~{v[40:47]}" () + call void asm sideeffect "", "~{v[48:55]}" () + call void asm sideeffect "", "~{v[56:63]}" () + call void asm sideeffect "", "~{v[64:71]}" () + call void asm sideeffect "", "~{v[72:79]}" () + call void asm sideeffect "", "~{v[80:87]}" () + call void asm sideeffect "", "~{v[88:95]}" () + call void asm sideeffect "", "~{v[96:103]}" () + call void asm sideeffect "", "~{v[104:111]}" () + call void asm sideeffect "", "~{v[112:119]}" () + call void asm sideeffect "", "~{v[120:127]}" () + call void asm sideeffect "", "~{v[128:135]}" () + call void asm sideeffect "", "~{v[136:143]}" () + call void asm sideeffect "", "~{v[144:151]}" () + call void asm sideeffect "", "~{v[152:159]}" () + call void asm sideeffect "", "~{v[160:167]}" () + call void asm sideeffect "", "~{v[168:175]}" () + call void asm sideeffect "", "~{v[176:183]}" () + call void asm sideeffect "", "~{v[184:191]}" () + call void asm sideeffect "", "~{v[192:199]}" () + call void asm sideeffect "", "~{v[200:207]}" () + call void asm sideeffect "", "~{v[208:215]}" () + call void asm sideeffect "", "~{v[216:223]}" () + call void asm sideeffect "", "~{v[224:231]}" () + call void asm sideeffect "", "~{v[232:239]}" () + call void asm sideeffect "", "~{v[240:247]}" () + call void asm sideeffect "", "~{v[248:255]}" () store i32 %in, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -79,7 +79,7 @@ ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { call void @llvm.AMDGPU.kill(float %x) - %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={VGPR7}"() + %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"() call void @llvm.AMDGPU.kill(float %y) ret void } @@ -128,7 +128,7 @@ v_nop_e64 v_nop_e64 v_nop_e64 - v_nop_e64", "={VGPR7}"() + v_nop_e64", "={v7}"() call void @llvm.AMDGPU.kill(float %var) br label %exit @@ -186,11 +186,11 @@ v_nop_e64 v_nop_e64 v_nop_e64 - v_nop_e64", "={VGPR7}"() - %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"() + v_nop_e64", "={v7}"() + %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={v8}"() call void @llvm.AMDGPU.kill(float %var) store volatile float %live.across, float addrspace(1)* undef - %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"() + %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={v9}"() br label %exit exit: @@ -242,7 +242,7 @@ v_nop_e64 v_nop_e64 v_nop_e64 - v_nop_e64", "={VGPR7}"() + v_nop_e64", "={v7}"() call void @llvm.AMDGPU.kill(float %var) %vgpr = load volatile i32, i32 addrspace(1)* undef %loop.cond = icmp eq i32 %vgpr, 0 Index: test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -20,13 +20,13 @@ %a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr ; mark most VGPR registers as used to increase register pressure - call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" () - call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" () - call void asm sideeffect "", 
"~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" () - call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" () - call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" () - call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" () - call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" () + call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" () + call void asm sideeffect "", "~{v36},~{v40},~{v44},~{v48},~{v52},~{v56},~{v60},~{v64}" () + call void asm sideeffect "", "~{v68},~{v72},~{v76},~{v80},~{v84},~{v88},~{v92},~{v96}" () + call void asm sideeffect "", "~{v100},~{v104},~{v108},~{v112},~{v116},~{v120},~{v124},~{v128}" () + call void asm sideeffect "", "~{v132},~{v136},~{v140},~{v144},~{v148},~{v152},~{v156},~{v160}" () + call void asm sideeffect "", "~{v164},~{v168},~{v172},~{v176},~{v180},~{v184},~{v188},~{v192}" () + call void asm sideeffect "", "~{v196},~{v200},~{v204},~{v208},~{v212},~{v216},~{v220},~{v224}" () %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %out, i32 %tid store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr Index: test/CodeGen/AMDGPU/undefined-subreg-liverange.ll =================================================================== --- test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -73,14 +73,14 @@ ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}} define amdgpu_kernel void @partially_undef_copy() #0 { - %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"() - %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"() + %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={v5}"() + %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={v6}"() %partially.undef.0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 %partially.undef.1 = insertelement <4 x i32> %partially.undef.0, i32 %tmp1, i32 0 store volatile <4 x i32> %partially.undef.1, <4 x i32> addrspace(1)* undef, align 16 - tail call void asm sideeffect "v_nop", "v={VGPR5_VGPR6_VGPR7_VGPR8}"(<4 x i32> %partially.undef.0) + tail call void asm sideeffect "v_nop", "v={v[5:8]}"(<4 x i32> %partially.undef.0) ret void } Index: test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir =================================================================== --- /dev/null +++ test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -0,0 +1,149 @@ +# RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- | + define void @test_mla() #0 { ret void } + define void @test_mla_v5() #1 { ret void } + + define void @test_mls() #2 { ret void } + define void @test_no_mls() { ret void } + + attributes #0 = { "target-features"="+v6" } + attributes #1 = { "target-features"="-v6" } + attributes #2 = { "target-features"="+v6t2" } +... 
+--- +name: test_mla +# CHECK-LABEL: name: test_mla +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]] = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %3, %2 + ; CHECK: [[VREGR:%[0-9]+]] = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_mla_v5 +# CHECK-LABEL: name: test_mla_v5 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]] = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %3, %2 + ; CHECK: [[VREGR:%[0-9]+]] = MLAv5 [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_mls +# CHECK-LABEL: name: test_mls +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]] = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGR:%[0-9]+]] = MLS [[VREGX]], [[VREGY]], [[VREGZ]], 14, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_no_mls +# CHECK-LABEL: name: test_no_mls +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]] = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGM:%[0-9]+]] = MULv5 [[VREGX]], [[VREGY]], 14, _, _ + ; CHECK: [[VREGR:%[0-9]+]] = SUBrr [[VREGZ]], [[VREGM]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... Index: test/CodeGen/Hexagon/mux-undef.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/mux-undef.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s +; +; Make sure this test compiles successfully. 
+; CHECK: jumpr r31 + +target triple = "hexagon--elf" + +; Function Attrs: nounwind +define i32 @fred() #0 { +b0: + call void @foo() #0 + br label %b1 + +b1: ; preds = %b0 + br i1 undef, label %b2, label %b3 + +b2: ; preds = %b1 + br label %b3 + +b3: ; preds = %b2, %b1 + %v4 = phi i32 [ 1, %b1 ], [ 2, %b2 ] + ret i32 %v4 +} + +declare void @foo() #0 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" } Index: test/CodeGen/PowerPC/BoolRetToIntTest-2.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/BoolRetToIntTest-2.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 < %s | FileCheck %s + +; https://bugs.llvm.org/show_bug.cgi?id=32442 +; Don't generate zero extension for the return value. +; CHECK-NOT: clrldi + +define zeroext i1 @foo(i32 signext %i, i32* %p) { +entry: + %cmp = icmp eq i32 %i, 0 + br i1 %cmp, label %return, label %if.end + +if.end: + store i32 %i, i32* %p, align 4 + br label %return + +return: + %retval = phi i1 [ true, %if.end ], [ false, %entry ] + ret i1 %retval +} Index: test/CodeGen/PowerPC/BoolRetToIntTest.ll =================================================================== --- test/CodeGen/PowerPC/BoolRetToIntTest.ll +++ test/CodeGen/PowerPC/BoolRetToIntTest.ll @@ -31,14 +31,14 @@ br i1 %call, label %cleanup.loopexit, label %for.cond cleanup.loopexit: ; preds = %for.body, %for.cond -; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] +; CHECK: [[PHI:%.+]] = phi i64 [ 1, %for.body ], [ 0, %for.cond ] %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] br label %cleanup cleanup: ; preds = %cleanup.loopexit, %entry -; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] +; CHECK: = phi i64 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: ret i1 [[REG]] ret i1 %cleanup.dest.slot.0 } @@ -78,14 +78,14 @@ br i1 %call, label %cleanup.loopexit, label %for.cond cleanup.loopexit: ; preds = %for.body, %for.cond -; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] +; CHECK: [[PHI:%.+]] = phi i64 [ 1, %for.body ], [ 0, %for.cond ] %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] br label %cleanup cleanup: ; preds = %cleanup.loopexit, %entry -; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] +; CHECK: = phi i64 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: call void %cont(i1 [[REG]] tail call void %cont(i1 %cleanup.dest.slot.0) ret void @@ -112,17 +112,17 @@ br i1 %call, label %cleanup.loopexit, label %for.cond cleanup.loopexit: ; preds = %for.body, %for.cond -; CHECK: [[PHI:%.+]] = phi i32 [ 1, %for.body ], [ 0, %for.cond ] +; CHECK: [[PHI:%.+]] = phi i64 [ 1, %for.body ], [ 0, %for.cond ] %cleanup.dest.slot.0.ph = phi i1 [ true, %for.body ], [ false, %for.cond ] br label %cleanup cleanup: ; preds = %cleanup.loopexit, %entry -; CHECK: = phi i32 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] +; CHECK: = phi i64 [ 0, %entry ], [ [[PHI]], %cleanup.loopexit ] %cleanup.dest.slot.0 = phi i1 [ false, %entry ], [ %cleanup.dest.slot.0.ph, %cleanup.loopexit ] -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; 
CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: call void %cont(i1 [[REG]] tail call void %cont(i1 %cleanup.dest.slot.0) -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: ret i1 [[REG]] ret i1 %cleanup.dest.slot.0 } @@ -136,7 +136,7 @@ br label %cleanup cleanup: -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: ret i1 [[REG]] %result = phi i1 [ false, %foo ], [ %operand, %entry ] ret i1 %result @@ -186,7 +186,7 @@ ; CHECK-LABEL: cleanup cleanup: -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: ret i1 [[REG]] %result = phi i1 [ %bar, %foo], [ %operand, %entry ] ret i1 %result @@ -198,8 +198,8 @@ define zeroext i1 @call_test() { ; CHECK: [[REG:%.+]] = call i1 %result = call i1 @return_i1() -; CHECK: [[REG:%.+]] = zext i1 {{%.+}} to i32 -; CHECK: [[REG:%.+]] = trunc i32 {{%.+}} to i1 +; CHECK: [[REG:%.+]] = zext i1 {{%.+}} to i64 +; CHECK: [[REG:%.+]] = trunc i64 {{%.+}} to i1 ; CHECK: ret i1 [[REG]] ret i1 %result } Index: test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll =================================================================== --- test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -17,13 +17,13 @@ ; Check 4 bytes - requires 1 load for each param. define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) { ; CHECK-LABEL: zeroEqualityTest02: -; CHECK: # BB#0: # %loadbb +; CHECK: # BB#0: ; CHECK-NEXT: lwz 3, 0(3) ; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: isel 3, 0, 5, 2 -; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: xor 3, 3, 4 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: srwi 3, 3, 5 +; CHECK-NEXT: xori 3, 3, 1 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4) %not.cmp = icmp ne i32 %call, 0 @@ -196,3 +196,27 @@ ret i32 %cond } +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) { +; CHECK-LABEL: length2_eq_nobuiltin_attr: +; CHECK: # BB#0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: std 0, 16(1) +; CHECK-NEXT: stdu 1, -32(1) +; CHECK-NEXT: .Lcfi0: +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .Lcfi1: +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: li 5, 2 +; CHECK-NEXT: bl memcmp +; CHECK-NEXT: nop +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: addi 1, 1, 32 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %m = tail call signext i32 @memcmp(i8* %X, i8* %Y, i64 2) nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + Index: test/CodeGen/PowerPC/vec_int_ext.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/vec_int_ext.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9 +target triple = "powerpc64le-unknown-linux-gnu" + +define <4 x i32> @vextsb2w(<16 x i8> %a) { +; PWR9-LABEL: vextsb2w: +; PWR9: # BB#0: # %entry +; PWR9-NEXT: vextsb2w 2, 2 +; PWR9-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 0 + %conv = sext i8 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 4 + %conv2 = sext i8 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <16 x i8> %a, i32 8 + %conv5 = sext i8 
%vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <16 x i8> %a, i32 12 + %conv8 = sext i8 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsb2d(<16 x i8> %a) { +; PWR9-LABEL: vextsb2d: +; PWR9: # BB#0: # %entry +; PWR9-NEXT: vextsb2d 2, 2 +; PWR9-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 0 + %conv = sext i8 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 8 + %conv2 = sext i8 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <4 x i32> @vextsh2w(<8 x i16> %a) { +; PWR9-LABEL: vextsh2w: +; PWR9: # BB#0: # %entry +; PWR9-NEXT: vextsh2w 2, 2 +; PWR9-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 0 + %conv = sext i16 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 2 + %conv2 = sext i16 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <8 x i16> %a, i32 4 + %conv5 = sext i16 %vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <8 x i16> %a, i32 6 + %conv8 = sext i16 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsh2d(<8 x i16> %a) { +; PWR9-LABEL: vextsh2d: +; PWR9: # BB#0: # %entry +; PWR9-NEXT: vextsh2d 2, 2 +; PWR9-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 0 + %conv = sext i16 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 4 + %conv2 = sext i16 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <2 x i64> @vextsw2d(<4 x i32> %a) { +; PWR9-LABEL: vextsw2d: +; PWR9: # BB#0: # %entry +; PWR9-NEXT: vextsw2d 2, 2 +; PWR9-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 0 + %conv = sext i32 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 2 + %conv2 = sext i32 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} Index: test/CodeGen/WebAssembly/offset-fastisel.ll =================================================================== --- /dev/null +++ test/CodeGen/WebAssembly/offset-fastisel.ll @@ -0,0 +1,100 @@ +; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals -fast-isel | FileCheck %s + +; TODO: Merge this with offset.ll when fast-isel matches better. 
+ +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown-elf" + +; CHECK-LABEL: store_i8_with_variable_gep_offset: +; CHECK: i32.add $push[[L0:[0-9]+]]=, $0, $1{{$}} +; CHECK: i32.const $push[[L1:[0-9]+]]=, 0{{$}} +; CHECK: i32.store8 0($pop[[L0]]), $pop[[L1]]{{$}} +define void @store_i8_with_variable_gep_offset(i8* %p, i32 %idx) { + %s = getelementptr inbounds i8, i8* %p, i32 %idx + store i8 0, i8* %s + ret void +} + +; CHECK-LABEL: store_i8_with_array_alloca_gep: +; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} +; CHECK: i32.load $push[[L1:[0-9]+]]=, __stack_pointer($pop[[L0]]){{$}} +; CHECK: i32.const $push[[L2:[0-9]+]]=, 32{{$}} +; CHECK: i32.sub $push{{[0-9]+}}=, $pop[[L1]], $pop[[L2]]{{$}} +; CHECK: i32.add $push[[L4:[0-9]+]]=, $pop{{[0-9]+}}, $0{{$}} +; CHECK: i32.const $push[[L5:[0-9]+]]=, 0{{$}} +; CHECK: i32.store8 0($pop[[L4]]), $pop[[L5]]{{$}} +define hidden void @store_i8_with_array_alloca_gep(i32 %idx) { + %A = alloca [30 x i8], align 16 + %s = getelementptr inbounds [30 x i8], [30 x i8]* %A, i32 0, i32 %idx + store i8 0, i8* %s, align 1 + ret void +} + +; CHECK-LABEL: store_i32_with_unfolded_gep_offset: +; CHECK: i32.const $push[[L0:[0-9]+]]=, 24{{$}} +; CHECK: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} +; CHECK: i32.const $push[[L2:[0-9]+]]=, 0{{$}} +; CHECK: i32.store 0($pop[[L1]]), $pop[[L2]]{{$}} +define void @store_i32_with_unfolded_gep_offset(i32* %p) { + %s = getelementptr i32, i32* %p, i32 6 + store i32 0, i32* %s + ret void +} + +; CHECK-LABEL: store_i32_with_folded_gep_offset: +; CHECK: i32.store 24($0), $pop{{[0-9]+$}} +define void @store_i32_with_folded_gep_offset(i32* %p) { + %s = getelementptr inbounds i32, i32* %p, i32 6 + store i32 0, i32* %s + ret void +} + +; CHECK-LABEL: load_i32_with_folded_gep_offset: +; CHECK: i32.load $push{{[0-9]+}}=, 24($0){{$}} +define i32 @load_i32_with_folded_gep_offset(i32* %p) { + %s = getelementptr inbounds i32, i32* %p, i32 6 + %t = load i32, i32* %s + ret i32 %t +} + +; CHECK-LABEL: store_i64_with_unfolded_gep_offset: +; CHECK: i32.const $push[[L0:[0-9]+]]=, 24{{$}} +; CHECK: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} +; CHECK: i64.const $push[[L2:[0-9]+]]=, 0{{$}} +; CHECK: i64.store 0($pop[[L1]]), $pop[[L2]]{{$}} +define void @store_i64_with_unfolded_gep_offset(i64* %p) { + %s = getelementptr i64, i64* %p, i32 3 + store i64 0, i64* %s + ret void +} + +; CHECK-LABEL: store_i8_with_folded_gep_offset: +; CHECK: i32.store8 24($0), $pop{{[0-9]+$}} +define void @store_i8_with_folded_gep_offset(i8* %p) { + %s = getelementptr inbounds i8, i8* %p, i32 24 + store i8 0, i8* %s + ret void +} + +; CHECK-LABEL: load_i8_u_with_folded_offset: +; CHECK: i32.load8_u $push{{[0-9]+}}=, 24($0){{$}} +define i32 @load_i8_u_with_folded_offset(i8* %p) { + %q = ptrtoint i8* %p to i32 + %r = add nuw i32 %q, 24 + %s = inttoptr i32 %r to i8* + %t = load i8, i8* %s + %u = zext i8 %t to i32 + ret i32 %u +} + +; TODO: this should be load8_s, need to fold sign-/zero-extend in fast-isel +; CHECK-LABEL: load_i8_s_with_folded_offset: +; CHECK: i32.load8_u $push{{[0-9]+}}=, 24($0){{$}} +define i32 @load_i8_s_with_folded_offset(i8* %p) { + %q = ptrtoint i8* %p to i32 + %r = add nuw i32 %q, 24 + %s = inttoptr i32 %r to i8* + %t = load i8, i8* %s + %u = sext i8 %t to i32 + ret i32 %u +} Index: test/CodeGen/X86/avx-schedule.ll =================================================================== --- test/CodeGen/X86/avx-schedule.ll +++ test/CodeGen/X86/avx-schedule.ll @@ -910,14 +910,14 @@ ; ; BTVER2-LABEL: 
test_haddpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -941,14 +941,14 @@ ; ; BTVER2-LABEL: test_haddps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -972,14 +972,14 @@ ; ; BTVER2-LABEL: test_hsubpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -1003,14 +1003,14 @@ ; ; BTVER2-LABEL: test_hsubps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -10,9 +10,28 @@ declare i32 @memcmp(i8*, i8*, i64) -define i1 @length2(i8* %X, i8* %Y, i32* nocapture 
%P) nounwind { +define i32 @length2(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length2: ; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $2 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: movl $2, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length2_eq: +; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movzwl (%ecx), %ecx @@ -20,7 +39,7 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-LABEL: length2: +; X64-LABEL: length2_eq: ; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax @@ -31,8 +50,8 @@ ret i1 %c } -define i1 @length2_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length2_const: +define i1 @length2_eq_const(i8* %X) nounwind { +; X32-LABEL: length2_eq_const: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movzwl (%eax), %eax @@ -40,7 +59,7 @@ ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; X64-LABEL: length2_const: +; X64-LABEL: length2_eq_const: ; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 @@ -51,8 +70,8 @@ ret i1 %c } -define i1 @length2_nobuiltin_attr(i8* %X, i8* %Y, i32* nocapture %P) nounwind { -; X32-LABEL: length2_nobuiltin_attr: +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length2_eq_nobuiltin_attr: ; X32: # BB#0: ; X32-NEXT: pushl $0 ; X32-NEXT: pushl $2 @@ -64,7 +83,7 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-LABEL: length2_nobuiltin_attr: +; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # BB#0: ; X64-NEXT: pushq %rax ; X64-NEXT: movl $2, %edx @@ -78,9 +97,74 @@ ret i1 %c } -define i1 @length4(i8* %X, i8* %Y, i32* nocapture %P) nounwind { +define i32 @length3(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length3: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: +; X64-NEXT: movl $3, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length3_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: testl %eax, %eax +; X32-NEXT: setne %al +; X32-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $3, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length4: ; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $4 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: movl $4, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 
@length4_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length4_eq: +; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl (%ecx), %ecx @@ -88,7 +172,7 @@ ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; X64-LABEL: length4: +; X64-LABEL: length4_eq: ; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax @@ -99,15 +183,15 @@ ret i1 %c } -define i1 @length4_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length4_const: +define i1 @length4_eq_const(i8* %X) nounwind { +; X32-LABEL: length4_eq_const: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-LABEL: length4_const: +; X64-LABEL: length4_eq_const: ; X64: # BB#0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 ; X64-NEXT: sete %al @@ -117,7 +201,53 @@ ret i1 %c } -define i1 @length8(i8* %X, i8* %Y, i32* nocapture %P) nounwind { +define i32 @length5(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length5: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: +; X64-NEXT: movl $5, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length5_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: testl %eax, %eax +; X32-NEXT: setne %al +; X32-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $5, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length8: ; X32: # BB#0: ; X32-NEXT: pushl $0 @@ -126,11 +256,30 @@ ; X32-NEXT: pushl {{[0-9]+}}(%esp) ; X32-NEXT: calll memcmp ; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: movl $8, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length8_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $8 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp ; X32-NEXT: testl %eax, %eax ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-LABEL: length8: +; X64-LABEL: length8_eq: ; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax @@ -141,8 +290,8 @@ ret i1 %c } -define i1 @length8_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length8_const: +define i1 @length8_eq_const(i8* %X) nounwind { +; X32-LABEL: length8_eq_const: ; X32: # BB#0: ; X32-NEXT: pushl $0 ; X32-NEXT: pushl $8 @@ -154,7 +303,7 @@ ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; X64-LABEL: length8_const: +; X64-LABEL: length8_eq_const: ; X64: # BB#0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 ; X64-NEXT: cmpq %rax, (%rdi) @@ -165,7 +314,55 @@ ret i1 %c } -define i1 @length16(i8* %x, i8* %y) nounwind { +define 
i1 @length12_eq(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length12_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: testl %eax, %eax +; X32-NEXT: setne %al +; X32-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $12, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind { +; X32-LABEL: length12: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: +; X64-NEXT: movl $12, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length16: ; X32: # BB#0: ; X32-NEXT: pushl $0 @@ -174,11 +371,30 @@ ; X32-NEXT: pushl {{[0-9]+}}(%esp) ; X32-NEXT: calll memcmp ; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: +; X64-NEXT: movl $16, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind { +; X32-LABEL: length16_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $16 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp ; X32-NEXT: testl %eax, %eax ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; SSE2-LABEL: length16: +; SSE2-LABEL: length16_eq: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rsi), %xmm0 ; SSE2-NEXT: movdqu (%rdi), %xmm1 @@ -188,7 +404,7 @@ ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: length16: +; AVX2-LABEL: length16_eq: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 @@ -201,8 +417,8 @@ ret i1 %cmp } -define i1 @length16_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length16_const: +define i1 @length16_eq_const(i8* %X) nounwind { +; X32-LABEL: length16_eq_const: ; X32: # BB#0: ; X32-NEXT: pushl $0 ; X32-NEXT: pushl $16 @@ -214,7 +430,7 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; SSE2-LABEL: length16_const: +; SSE2-LABEL: length16_eq_const: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 @@ -223,7 +439,7 @@ ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; -; AVX2-LABEL: length16_const: +; AVX2-LABEL: length16_eq_const: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 @@ -236,7 +452,7 @@ ret i1 %c } -define i1 @length32(i8* %x, i8* %y) nounwind { +define i32 @length32(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length32: ; X32: # BB#0: ; X32-NEXT: pushl $0 @@ -245,11 +461,32 @@ ; X32-NEXT: pushl {{[0-9]+}}(%esp) ; X32-NEXT: calll memcmp ; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + 
+define i1 @length32_eq(i8* %x, i8* %y) nounwind { +; X32-LABEL: length32_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $32 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp ; X32-NEXT: testl %eax, %eax ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; SSE2-LABEL: length32: +; SSE2-LABEL: length32_eq: ; SSE2: # BB#0: ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: movl $32, %edx @@ -259,7 +496,7 @@ ; SSE2-NEXT: popq %rcx ; SSE2-NEXT: retq ; -; AVX2-LABEL: length32: +; AVX2-LABEL: length32_eq: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 @@ -273,8 +510,8 @@ ret i1 %cmp } -define i1 @length32_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length32_const: +define i1 @length32_eq_const(i8* %X) nounwind { +; X32-LABEL: length32_eq_const: ; X32: # BB#0: ; X32-NEXT: pushl $0 ; X32-NEXT: pushl $32 @@ -286,7 +523,7 @@ ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; SSE2-LABEL: length32_const: +; SSE2-LABEL: length32_eq_const: ; SSE2: # BB#0: ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: movl $.L.str, %esi @@ -297,7 +534,7 @@ ; SSE2-NEXT: popq %rcx ; SSE2-NEXT: retq ; -; AVX2-LABEL: length32_const: +; AVX2-LABEL: length32_eq_const: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 @@ -311,7 +548,7 @@ ret i1 %c } -define i1 @length64(i8* %x, i8* %y) nounwind { +define i32 @length64(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length64: ; X32: # BB#0: ; X32-NEXT: pushl $0 @@ -320,11 +557,30 @@ ; X32-NEXT: pushl {{[0-9]+}}(%esp) ; X32-NEXT: calll memcmp ; X32-NEXT: addl $16, %esp +; X32-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind { +; X32-LABEL: length64_eq: +; X32: # BB#0: +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $64 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll memcmp +; X32-NEXT: addl $16, %esp ; X32-NEXT: testl %eax, %eax ; X32-NEXT: setne %al ; X32-NEXT: retl ; -; X64-LABEL: length64: +; X64-LABEL: length64_eq: ; X64: # BB#0: ; X64-NEXT: pushq %rax ; X64-NEXT: movl $64, %edx @@ -338,8 +594,8 @@ ret i1 %cmp } -define i1 @length64_const(i8* %X, i32* nocapture %P) nounwind { -; X32-LABEL: length64_const: +define i1 @length64_eq_const(i8* %X) nounwind { +; X32-LABEL: length64_eq_const: ; X32: # BB#0: ; X32-NEXT: pushl $0 ; X32-NEXT: pushl $64 @@ -351,7 +607,7 @@ ; X32-NEXT: sete %al ; X32-NEXT: retl ; -; X64-LABEL: length64_const: +; X64-LABEL: length64_eq_const: ; X64: # BB#0: ; X64-NEXT: pushq %rax ; X64-NEXT: movl $.L.str, %esi Index: test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- test/CodeGen/X86/mul-constant-i16.ll +++ test/CodeGen/X86/mul-constant-i16.ll @@ -188,13 +188,16 @@ ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $11, %eax, %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_11: ; X64: # BB#0: -; X64-NEXT: imull $11, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 11 @@ -225,13 +228,16 @@ ; X86-LABEL: 
test_mul_by_13: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $13, %eax, %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_13: ; X64: # BB#0: -; X64-NEXT: imull $13, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 13 @@ -241,14 +247,19 @@ define i16 @test_mul_by_14(i16 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $14, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %eax +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_14: ; X64: # BB#0: -; X64-NEXT: imull $14, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 14 @@ -337,14 +348,19 @@ define i16 @test_mul_by_19(i16 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $19, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %eax +; X86-NEXT: shll $2, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_19: ; X64: # BB#0: -; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: shll $2, %eax +; X64-NEXT: subl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 19 @@ -375,13 +391,16 @@ ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $21, %eax, %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_21: ; X64: # BB#0: -; X64-NEXT: imull $21, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rdi,%rax,4), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 21 @@ -391,14 +410,19 @@ define i16 @test_mul_by_22(i16 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $22, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %eax +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_22: ; X64: # BB#0: -; X64-NEXT: imull $22, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 22 @@ -408,14 +432,19 @@ define i16 @test_mul_by_23(i16 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $23, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %eax +; X86-NEXT: shll $3, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_23: ; X64: # BB#0: -; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal 
(%rdi,%rdi,2), %eax +; X64-NEXT: shll $3, %eax +; X64-NEXT: subl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 23 @@ -465,14 +494,19 @@ define i16 @test_mul_by_26(i16 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $26, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_26: ; X64: # BB#0: -; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: subl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 26 @@ -502,14 +536,19 @@ define i16 @test_mul_by_28(i16 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $28, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_28: ; X64: # BB#0: -; X64-NEXT: imull $28, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 28 @@ -519,14 +558,21 @@ define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $29, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_29: ; X64: # BB#0: -; X64-NEXT: imull $29, %edi, %eax +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: addl %edi, %eax +; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 @@ -536,14 +582,20 @@ define i16 @test_mul_by_30(i16 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $30, %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shll $5, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_30: ; X64: # BB#0: -; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $5, %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: subl %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 30 @@ -587,3 +639,30 @@ %mul = mul nsw i16 %x, 32 ret i16 %mul } + +; (x*9+42)*(x*5+2) +define i16 @test_mul_spec(i16 %x) nounwind { +; X86-LABEL: test_mul_spec: +; X86: # BB#0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal 42(%eax,%eax,8), %ecx +; X86-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: # kill: %AX %AX %EAX +; X86-NEXT: retl +; +; X64-LABEL: test_mul_spec: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx +; X64-NEXT: leal 2(%rdi,%rdi,4), %eax +; X64-NEXT: imull %ecx, %eax +; X64-NEXT: # kill: %AX %AX %EAX +; X64-NEXT: retq + %mul = mul nsw i16 %x, 9 + 
%add = add nsw i16 %mul, 42 + %mul2 = mul nsw i16 %x, 5 + %add2 = add nsw i16 %mul2, 2 + %mul3 = mul nsw i16 %add, %add2 + ret i16 %mul3 +} Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW +; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG +; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM +; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT define i32 @test_mul_by_1(i32 %x) { ; X86-LABEL: test_mul_by_1: @@ -8,10 +14,40 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_1: -; X64: # BB#0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_1: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_1: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_1: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_1: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_1: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_1: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_1: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 1 ret i32 %mul } @@ -23,11 +59,47 @@ ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_2: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_2: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_2: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_2: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: addl %eax, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: 
test_mul_by_2: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_2: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_2: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_2: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 2 ret i32 %mul } @@ -38,11 +110,46 @@ ; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_3: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_3: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_3: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_3: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_3: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_3: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_3: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_3: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 3 ret i32 %mul } @@ -54,11 +161,47 @@ ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_4: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (,%rdi,4), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_4: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_4: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_4: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: shll $2, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_4: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_4: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq 
# sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_4: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_4: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 4 ret i32 %mul } @@ -69,11 +212,46 @@ ; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_5: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_5: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_5: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_5: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_5: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_5: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_5: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_5: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 5 ret i32 %mul } @@ -86,12 +264,46 @@ ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_6: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: addl %edi, %edi -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_6: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_6: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_6: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_6: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_6: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_6: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_6: +; SLM-NOOPT: # BB#0: +; 
SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 6 ret i32 %mul } @@ -104,12 +316,46 @@ ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_7: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (,%rdi,8), %eax -; X64-NEXT: subl %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_7: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_7: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_7: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_7: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_7: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_7: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_7: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 7 ret i32 %mul } @@ -121,11 +367,47 @@ ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_8: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (,%rdi,8), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_8: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_8: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_8: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: shll $3, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_8: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_8: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_8: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_8: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 8 ret i32 %mul } @@ -136,11 +418,46 @@ ; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_9: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: 
retq +; X64-HSW-LABEL: test_mul_by_9: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_9: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_9: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_9: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_9: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_9: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_9: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI +; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 9 ret i32 %mul } @@ -153,12 +470,46 @@ ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_10: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: addl %edi, %edi -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_10: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_10: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_10: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_10: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_10: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_10: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_10: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 10 ret i32 %mul } @@ -166,13 +517,49 @@ define i32 @test_mul_by_11(i32 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_11: -; X64: # BB#0: -; X64-NEXT: imull $11, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_11: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), 
%eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_11: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_11: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_11: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_11: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_11: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_11: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 11 ret i32 %mul } @@ -185,12 +572,46 @@ ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_12: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: shll $2, %edi -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_12: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_12: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_12: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_12: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_12: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_12: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_12: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 12 ret i32 %mul } @@ -198,13 +619,49 @@ define i32 @test_mul_by_13(i32 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_13: -; X64: # BB#0: -; X64-NEXT: imull $13, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_13: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_13: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: 
# kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_13: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_13: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_13: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_13: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_13: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 13 ret i32 %mul } @@ -212,13 +669,52 @@ define i32 @test_mul_by_14(i32 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %eax +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_14: -; X64: # BB#0: -; X64-NEXT: imull $14, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_14: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_14: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_14: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_14: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_14: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_14: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_14: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 14 ret i32 %mul } @@ -231,12 +727,46 @@ ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_15: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_15: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_15: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax 
# sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_15: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_15: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_15: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_15: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_15: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 15 ret i32 %mul } @@ -248,11 +778,47 @@ ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_16: -; X64: # BB#0: -; X64-NEXT: shll $4, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_16: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50] +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_16: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50] +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_16: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: shll $4, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_16: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] +; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_16: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] +; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_16: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00] +; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_16: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00] +; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 16 ret i32 %mul } @@ -266,13 +832,49 @@ ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_17: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $4, %eax -; X64-NEXT: leal (%rax,%rdi), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_17: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_17: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_17: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax +; 
X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_17: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_17: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_17: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_17: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 17 ret i32 %mul } @@ -285,12 +887,46 @@ ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_18: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: addl %edi, %edi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_18: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_18: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_18: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_18: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_18: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_18: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] +; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_18: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 18 ret i32 %mul } @@ -298,13 +934,52 @@ define i32 @test_mul_by_19(i32 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %eax +; X86-NEXT: shll $2, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_19: -; X64: # BB#0: -; X64-NEXT: imull $19, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_19: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_19: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_19: +; X86-NOOPT: # BB#0: +; 
X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_19: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_19: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_19: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_19: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 19 ret i32 %mul } @@ -317,12 +992,46 @@ ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_20: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: shll $2, %edi -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_20: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_20: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_20: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_20: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_20: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_20: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_20: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 20 ret i32 %mul } @@ -330,13 +1039,49 @@ define i32 @test_mul_by_21(i32 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_21: -; X64: # BB#0: -; X64-NEXT: imull $21, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_21: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_21: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_21: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_21: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00] 
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_21: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_21: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_21: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 21 ret i32 %mul } @@ -344,13 +1089,52 @@ define i32 @test_mul_by_22(i32 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %eax +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_22: -; X64: # BB#0: -; X64-NEXT: imull $22, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_22: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_22: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_22: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_22: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_22: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_22: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_22: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 22 ret i32 %mul } @@ -358,13 +1142,52 @@ define i32 @test_mul_by_23(i32 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %eax +; X86-NEXT: shll $3, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_23: -; X64: # BB#0: -; X64-NEXT: imull $23, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_23: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_23: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_23: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; 
HSW-NOOPT-LABEL: test_mul_by_23: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_23: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_23: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_23: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 23 ret i32 %mul } @@ -377,12 +1200,46 @@ ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_24: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: shll $3, %edi -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_24: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_24: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_24: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_24: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_24: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_24: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_24: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 24 ret i32 %mul } @@ -395,12 +1252,46 @@ ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_25: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rax,%rax,4), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_25: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_25: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_25: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_25: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_25: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; 
X64-SLM-LABEL: test_mul_by_25: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_25: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 25 ret i32 %mul } @@ -408,13 +1299,52 @@ define i32 @test_mul_by_26(i32 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_26: -; X64: # BB#0: -; X64-NEXT: imull $26, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_26: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_26: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_26: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_26: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_26: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_26: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_26: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 26 ret i32 %mul } @@ -427,12 +1357,46 @@ ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_27: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_27: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_27: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_27: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_27: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_27: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: 
test_mul_by_27: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: # kill: %EDI %EDI %RDI +; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] +; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_27: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 27 ret i32 %mul } @@ -440,13 +1404,52 @@ define i32 @test_mul_by_28(i32 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_28: -; X64: # BB#0: -; X64-NEXT: imull $28, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_28: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_28: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_28: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_28: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_28: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_28: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_28: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 28 ret i32 %mul } @@ -454,13 +1457,55 @@ define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: leal (%ecx,%ecx,8), %eax +; X86-NEXT: leal (%eax,%eax,2), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_29: -; X64: # BB#0: -; X64-NEXT: imull $29, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_29: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_29: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_29: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), 
%eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_29: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_29: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_29: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_29: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 29 ret i32 %mul } @@ -468,13 +1513,53 @@ define i32 @test_mul_by_30(i32 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shll $5, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_30: -; X64: # BB#0: -; X64-NEXT: imull $30, %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_30: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_30: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_30: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_30: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_30: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_30: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_30: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 30 ret i32 %mul } @@ -488,12 +1573,46 @@ ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_31: -; X64: # BB#0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $5, %eax -; X64-NEXT: subl %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_31: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] +; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_31: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] +; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_31: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_31: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: 
[1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_31: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_31: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00] +; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_31: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 31 ret i32 %mul } @@ -505,11 +1624,124 @@ ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_32: -; X64: # BB#0: -; X64-NEXT: shll $5, %edi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_32: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50] +; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_32: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50] +; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_32: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: shll $5, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_32: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] +; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_32: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] +; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_32: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00] +; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_32: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00] +; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i32 %x, 32 ret i32 %mul } + +; (x*9+42)*(x*5+2) +define i32 @test_mul_spec(i32 %x) nounwind { +; X86-LABEL: test_mul_spec: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal 42(%eax,%eax,8), %ecx +; X86-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: retl +; +; X64-HSW-LABEL: test_mul_spec: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: # kill: %EDI %EDI %RDI +; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] +; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25] +; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_spec: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: # kill: %EDI %EDI %RDI +; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] +; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] +; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_spec: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx +; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax +; X86-NOOPT-NEXT: imull %ecx, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_spec: +; HSW-NOOPT: # BB#0: +; 
HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI
+; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25]
+; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_spec:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI
+; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_spec:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: # kill: %EDI %EDI %RDI
+; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
+; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_spec:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI
+; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
+; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i32 %x, 9
+ %add = add nsw i32 %mul, 42
+ %mul2 = mul nsw i32 %x, 5
+ %add2 = add nsw i32 %mul2, 2
+ %mul3 = mul nsw i32 %add, %add2
+ ret i32 %mul3
+}
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -1,18 +1,55 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
+; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
 
-define i64 @test_mul_by_1(i64 %x) {
+define i64 @test_mul_by_1(i64 %x) nounwind {
 ; X86-LABEL: test_mul_by_1:
 ; X86: # BB#0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_1:
-; X64: # BB#0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_1:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_1:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_1:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_1: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_1: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_1: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_1: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 1 ret i64 %mul } @@ -26,10 +63,43 @@ ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_2: -; X64: # BB#0: -; X64-NEXT: leaq (%rdi,%rdi), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_2: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_2: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_2: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: shldl $1, %eax, %edx +; X86-NOOPT-NEXT: addl %eax, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_2: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_2: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_2: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_2: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 2 ret i64 %mul } @@ -43,10 +113,43 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_3: -; X64: # BB#0: -; X64-NEXT: leaq (%rdi,%rdi,2), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_3: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_3: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_3: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $3, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_3: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_3: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_3: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_3: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 3 ret i64 %mul } @@ -60,10 +163,43 @@ ; X86-NEXT: shll 
$2, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_4: -; X64: # BB#0: -; X64-NEXT: leaq (,%rdi,4), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_4: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_4: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_4: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: shldl $2, %eax, %edx +; X86-NOOPT-NEXT: shll $2, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_4: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_4: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_4: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_4: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 4 ret i64 %mul } @@ -77,10 +213,43 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_5: -; X64: # BB#0: -; X64-NEXT: leaq (%rdi,%rdi,4), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_5: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_5: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_5: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $5, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_5: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_5: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_5: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_5: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 5 ret i64 %mul } @@ -95,11 +264,46 @@ ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_6: -; X64: # BB#0: -; X64-NEXT: addq %rdi, %rdi -; X64-NEXT: leaq (%rdi,%rdi,2), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_6: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_6: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_6: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $6, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $6, 
{{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_6: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_6: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_6: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_6: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 6 ret i64 %mul } @@ -115,11 +319,46 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_7: -; X64: # BB#0: -; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: subq %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_7: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_7: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_7: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $7, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_7: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_7: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_7: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] +; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_7: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 7 ret i64 %mul } @@ -133,10 +372,43 @@ ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_8: -; X64: # BB#0: -; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_8: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_8: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_8: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: shldl $3, %eax, %edx +; X86-NOOPT-NEXT: shll $3, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_8: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_8: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_8: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: 
[4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_8: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 8 ret i64 %mul } @@ -150,10 +422,43 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_9: -; X64: # BB#0: -; X64-NEXT: leaq (%rdi,%rdi,8), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_9: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_9: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_9: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $9, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_9: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_9: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_9: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_9: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 9 ret i64 %mul } @@ -168,11 +473,46 @@ ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_10: -; X64: # BB#0: -; X64-NEXT: addq %rdi, %rdi -; X64-NEXT: leaq (%rdi,%rdi,4), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_10: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_10: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_10: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $10, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_10: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_10: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_10: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_10: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 10 ret i64 %mul } @@ -180,16 +520,53 @@ define i64 @test_mul_by_11(i64 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,2), %ecx ; X86-NEXT: movl $11, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx 
; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_11: -; X64: # BB#0: -; X64-NEXT: imulq $11, %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_11: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_11: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_11: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $11, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_11: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_11: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_11: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_11: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 11 ret i64 %mul } @@ -204,11 +581,46 @@ ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_12: -; X64: # BB#0: -; X64-NEXT: shlq $2, %rdi -; X64-NEXT: leaq (%rdi,%rdi,2), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_12: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_12: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_12: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $12, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_12: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_12: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_12: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] +; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_12: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 12 ret i64 %mul } @@ -216,16 +628,53 @@ define i64 @test_mul_by_13(i64 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $13, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_13: -; X64: # BB#0: -; X64-NEXT: imulq $13, %rdi, 
%rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_13: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_13: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_13: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $13, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_13: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_13: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_13: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_13: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 13 ret i64 %mul } @@ -233,16 +682,56 @@ define i64 @test_mul_by_14(i64 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %ecx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $14, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_14: -; X64: # BB#0: -; X64-NEXT: imulq $14, %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_14: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_14: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_14: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $14, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_14: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_14: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_14: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_14: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 14 ret i64 %mul } @@ -258,11 +747,46 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_15: -; X64: # BB#0: -; X64-NEXT: leaq (%rdi,%rdi,4), %rax -; X64-NEXT: leaq (%rax,%rax,2), %rax -; X64-NEXT: 
retq +; X64-HSW-LABEL: test_mul_by_15: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_15: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_15: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $15, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_15: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_15: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_15: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_15: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 15 ret i64 %mul } @@ -276,11 +800,49 @@ ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_16: -; X64: # BB#0: -; X64-NEXT: shlq $4, %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_16: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50] +; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_16: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_16: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOOPT-NEXT: shldl $4, %eax, %edx +; X86-NOOPT-NEXT: shll $4, %eax +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_16: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] +; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_16: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] +; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_16: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00] +; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_16: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00] +; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 16 ret i64 %mul } @@ -297,12 +859,49 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_17: -; X64: # BB#0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: leaq (%rax,%rdi), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_17: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] 
+; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_17: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] +; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_17: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $17, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_17: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_17: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_17: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00] +; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_17: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 17 ret i64 %mul } @@ -317,11 +916,46 @@ ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_18: -; X64: # BB#0: -; X64-NEXT: addq %rdi, %rdi -; X64-NEXT: leaq (%rdi,%rdi,8), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_18: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_18: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_18: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $18, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_18: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_18: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_18: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] +; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_18: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 18 ret i64 %mul } @@ -329,16 +963,56 @@ define i64 @test_mul_by_19(i64 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: shll $2, %ecx +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $19, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_19: -; X64: # BB#0: -; X64-NEXT: imulq $19, %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_19: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: shlq $2, 
%rax # sched: [1:0.50] +; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_19: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50] +; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_19: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $19, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_19: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_19: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_19: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_19: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 19 ret i64 %mul } @@ -353,11 +1027,46 @@ ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_20: -; X64: # BB#0: -; X64-NEXT: shlq $2, %rdi -; X64-NEXT: leaq (%rdi,%rdi,4), %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_20: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_20: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_20: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $20, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_20: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_20: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_20: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] +; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_20: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 20 ret i64 %mul } @@ -365,16 +1074,53 @@ define i64 @test_mul_by_21(i64 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $21, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_21: -; X64: # BB#0: -; X64-NEXT: imulq $21, %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_21: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; 
X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_21: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_21: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $21, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_21: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_21: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_21: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_21: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 21 ret i64 %mul } @@ -382,16 +1128,56 @@ define i64 @test_mul_by_22(i64 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,4), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %ecx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-LABEL: test_mul_by_22: -; X64: # BB#0: -; X64-NEXT: imulq $22, %rdi, %rax -; X64-NEXT: retq +; X64-HSW-LABEL: test_mul_by_22: +; X64-HSW: # BB#0: +; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: retq # sched: [1:1.00] +; +; X64-JAG-LABEL: test_mul_by_22: +; X64-JAG: # BB#0: +; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] +; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: retq # sched: [4:1.00] +; +; X86-NOOPT-LABEL: test_mul_by_22: +; X86-NOOPT: # BB#0: +; X86-NOOPT-NEXT: movl $22, %eax +; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx +; X86-NOOPT-NEXT: addl %ecx, %edx +; X86-NOOPT-NEXT: retl +; +; HSW-NOOPT-LABEL: test_mul_by_22: +; HSW-NOOPT: # BB#0: +; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; +; JAG-NOOPT-LABEL: test_mul_by_22: +; JAG-NOOPT: # BB#0: +; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] +; JAG-NOOPT-NEXT: retq # sched: [4:1.00] +; +; X64-SLM-LABEL: test_mul_by_22: +; X64-SLM: # BB#0: +; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] +; X64-SLM-NEXT: retq # sched: [4:1.00] +; +; SLM-NOOPT-LABEL: test_mul_by_22: +; SLM-NOOPT: # BB#0: +; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] +; SLM-NOOPT-NEXT: retq # sched: [4:1.00] %mul = mul nsw i64 %x, 22 ret i64 %mul } @@ -399,16 +1185,56 @@ define i64 @test_mul_by_23(i64 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: shll $3, %ecx +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $23, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; 
X64-LABEL: test_mul_by_23:
-; X64: # BB#0:
-; X64-NEXT: imulq $23, %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_23:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50]
+; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_23:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_23:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $23, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_23:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_23:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_23:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_23:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 23
 ret i64 %mul
 }
@@ -423,11 +1249,46 @@
 ; X86-NEXT: leal (%edx,%ecx,8), %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_24:
-; X64: # BB#0:
-; X64-NEXT: shlq $3, %rdi
-; X64-NEXT: leaq (%rdi,%rdi,2), %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_24:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_24:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_24:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $24, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_24:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_24:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_24:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_24:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 24
 ret i64 %mul
 }
@@ -443,11 +1304,46 @@
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_25:
-; X64: # BB#0:
-; X64-NEXT: leaq (%rdi,%rdi,4), %rax
-; X64-NEXT: leaq (%rax,%rax,4), %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_25:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_25:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_25:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $25, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_25:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_25:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_25:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_25:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 25
 ret i64 %mul
 }
@@ -455,16 +1351,56 @@
 define i64 @test_mul_by_26(i64 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: subl %eax, %ecx
 ; X86-NEXT: movl $26, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_26:
-; X64: # BB#0:
-; X64-NEXT: imulq $26, %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_26:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_26:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_26:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $26, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_26:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_26:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_26:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_26:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 26
 ret i64 %mul
 }
@@ -480,11 +1416,46 @@
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_27:
-; X64: # BB#0:
-; X64-NEXT: leaq (%rdi,%rdi,8), %rax
-; X64-NEXT: leaq (%rax,%rax,2), %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_27:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_27:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_27:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $27, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_27:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_27:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_27:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_27:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 27
 ret i64 %mul
 }
@@ -492,16 +1463,56 @@
 define i64 @test_mul_by_28(i64 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: addl %eax, %ecx
 ; X86-NEXT: movl $28, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_28:
-; X64: # BB#0:
-; X64-NEXT: imulq $28, %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_28:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_28:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_28:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $28, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_28:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_28:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_28:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_28:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 28
 ret i64 %mul
 }
@@ -509,16 +1520,59 @@
 define i64 @test_mul_by_29(i64 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %eax, %ecx
 ; X86-NEXT: movl $29, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_29:
-; X64: # BB#0:
-; X64-NEXT: imulq $29, %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_29:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_29:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_29:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $29, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_29:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_29:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_29:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_29:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 29
 ret i64 %mul
 }
@@ -526,16 +1580,59 @@
 define i64 @test_mul_by_30(i64 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $5, %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: subl %eax, %ecx
 ; X86-NEXT: movl $30, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_30:
-; X64: # BB#0:
-; X64-NEXT: imulq $30, %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_30:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
+; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_30:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_30:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $30, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_30:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_30:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_30:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_30:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 30
 ret i64 %mul
 }
@@ -552,12 +1649,49 @@
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_31:
-; X64: # BB#0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shlq $5, %rax
-; X64-NEXT: subq %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_31:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
+; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_31:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_31:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl $31, %eax
+; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_31:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_31:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_31:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00]
+; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_31:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 31
 ret i64 %mul
 }
@@ -571,11 +1705,168 @@
 ; X86-NEXT: shll $5, %eax
 ; X86-NEXT: retl
 ;
-; X64-LABEL: test_mul_by_32:
-; X64: # BB#0:
-; X64-NEXT: shlq $5, %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: retq
+; X64-HSW-LABEL: test_mul_by_32:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_32:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_32:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT: shldl $5, %eax, %edx
+; X86-NOOPT-NEXT: shll $5, %eax
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_32:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
+; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_32:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
+; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_32:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_32:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00]
+; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
 %mul = mul nsw i64 %x, 32
 ret i64 %mul
 }
+
+; (x*9+42)*(x*5+2)
+define i64 @test_mul_spec(i64 %x) nounwind {
+; X86-LABEL: test_mul_spec:
+; X86: # BB#0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl $9, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: leal (%edi,%edi,8), %ebx
+; X86-NEXT: addl $42, %esi
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: movl $5, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: leal (%edi,%edi,4), %edi
+; X86-NEXT: addl $2, %ecx
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: imull %esi, %edi
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: imull %ebx, %ecx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: test_mul_spec:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25]
+; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
+; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; X64-HSW-NEXT: retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_spec:
+; X64-JAG: # BB#0:
+; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; X64-JAG-NEXT: retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_spec:
+; X86-NOOPT: # BB#0:
+; X86-NOOPT-NEXT: pushl %ebx
+; X86-NOOPT-NEXT: pushl %edi
+; X86-NOOPT-NEXT: pushl %esi
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NOOPT-NEXT: movl $9, %edx
+; X86-NOOPT-NEXT: movl %ecx, %eax
+; X86-NOOPT-NEXT: mull %edx
+; X86-NOOPT-NEXT: movl %eax, %esi
+; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx
+; X86-NOOPT-NEXT: addl $42, %esi
+; X86-NOOPT-NEXT: adcl %edx, %ebx
+; X86-NOOPT-NEXT: movl $5, %edx
+; X86-NOOPT-NEXT: movl %ecx, %eax
+; X86-NOOPT-NEXT: mull %edx
+; X86-NOOPT-NEXT: movl %eax, %ecx
+; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi
+; X86-NOOPT-NEXT: addl $2, %ecx
+; X86-NOOPT-NEXT: adcl %edx, %edi
+; X86-NOOPT-NEXT: movl %esi, %eax
+; X86-NOOPT-NEXT: mull %ecx
+; X86-NOOPT-NEXT: imull %esi, %edi
+; X86-NOOPT-NEXT: addl %edi, %edx
+; X86-NOOPT-NEXT: imull %ebx, %ecx
+; X86-NOOPT-NEXT: addl %ecx, %edx
+; X86-NOOPT-NEXT: popl %esi
+; X86-NOOPT-NEXT: popl %edi
+; X86-NOOPT-NEXT: popl %ebx
+; X86-NOOPT-NEXT: retl
+;
+; HSW-NOOPT-LABEL: test_mul_spec:
+; HSW-NOOPT: # BB#0:
+; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25]
+; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_spec:
+; JAG-NOOPT: # BB#0:
+; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_spec:
+; X64-SLM: # BB#0:
+; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
+; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; X64-SLM-NEXT: retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_spec:
+; SLM-NOOPT: # BB#0:
+; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
+; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+ %mul = mul nsw i64 %x, 9
+ %add = add nsw i64 %mul, 42
+ %mul2 = mul nsw i64 %x, 5
+ %add2 = add nsw i64 %mul2, 2
+ %mul3 = mul nsw i64 %add, %add2
+ ret i64 %mul3
+}
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -0,0 +1,1291 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell | FileCheck %s --check-prefix=X64-HSW
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @mult(i32, i32) local_unnamed_addr #0 {
+; X86-LABEL: mult:
+; X86: # BB#0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .Lcfi0:
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .Lcfi1:
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl $1, %edx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: jg .LBB0_2
+; X86-NEXT: # BB#1:
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB0_4
+; X86-NEXT: # BB#3:
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB0_4:
+; X86-NEXT: decl %ecx
+; X86-NEXT: cmpl $31, %ecx
+; X86-NEXT: ja .LBB0_39
+; X86-NEXT: # BB#5:
+; X86-NEXT: jmpl *.LJTI0_0(,%ecx,4)
+; X86-NEXT: .LBB0_6:
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_39:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB0_40:
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_7:
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_8:
+; X86-NEXT: shll $2, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_9:
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_10:
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_11:
+; X86-NEXT: leal (,%eax,8), %ecx
+; X86-NEXT: jmp .LBB0_12
+; X86-NEXT: .LBB0_13:
+; X86-NEXT: shll $3, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_14:
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_15:
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_16:
+; X86-NEXT: leal (%eax,%eax,4), %ecx
+; X86-NEXT: leal (%eax,%ecx,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_17:
+; X86-NEXT: shll $2, %eax
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_18:
+; X86-NEXT: leal (%eax,%eax,2), %ecx
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_19:
+; X86-NEXT: leal (%eax,%eax,2), %ecx
+; X86-NEXT: jmp .LBB0_20
+; X86-NEXT: .LBB0_21:
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_22:
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_23:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_24:
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_25:
+; X86-NEXT: leal (%eax,%eax,4), %ecx
+; X86-NEXT: shll $2, %ecx
+; X86-NEXT: jmp .LBB0_12
+; X86-NEXT: .LBB0_26:
+; X86-NEXT: shll $2, %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_27:
+; X86-NEXT: leal (%eax,%eax,4), %ecx
+; X86-NEXT: leal (%eax,%ecx,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_28:
+; X86-NEXT: leal (%eax,%eax,4), %ecx
+; X86-NEXT: .LBB0_20:
+; X86-NEXT: leal (%eax,%ecx,4), %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_29:
+; X86-NEXT: leal (%eax,%eax,2), %ecx
+; X86-NEXT: shll $3, %ecx
+; X86-NEXT: jmp .LBB0_12
+; X86-NEXT: .LBB0_30:
+; X86-NEXT: shll $3, %eax
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_31:
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_32:
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: jmp .LBB0_12
+; X86-NEXT: .LBB0_33:
+; X86-NEXT: leal (%eax,%eax,8), %eax
+; X86-NEXT: leal (%eax,%eax,2), %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_34:
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_35:
+; X86-NEXT: leal (%eax,%eax,8), %ecx
+; X86-NEXT: leal (%ecx,%ecx,2), %ecx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_36:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $5, %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: jmp .LBB0_12
+; X86-NEXT: .LBB0_37:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shll $5, %ecx
+; X86-NEXT: .LBB0_12:
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_38:
+; X86-NEXT: shll $5, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-HSW-LABEL: mult:
+; X64-HSW: # BB#0:
+; X64-HSW-NEXT: # kill: %EDI %EDI %RDI
+; X64-HSW-NEXT: cmpl $1, %esi
+; X64-HSW-NEXT: movl $1, %ecx
+; X64-HSW-NEXT: movl %esi, %eax
+; X64-HSW-NEXT: cmovgl %ecx, %eax
+; X64-HSW-NEXT: testl %esi, %esi
+; X64-HSW-NEXT: cmovel %ecx, %eax
+; X64-HSW-NEXT: addl $-1, %edi
+; X64-HSW-NEXT: cmpl $31, %edi
+; X64-HSW-NEXT: ja .LBB0_36
+; X64-HSW-NEXT: # BB#1:
+; X64-HSW-NEXT: jmpq *.LJTI0_0(,%rdi,8)
+; X64-HSW-NEXT: .LBB0_2:
+; X64-HSW-NEXT: addl %eax, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_36:
+; X64-HSW-NEXT: xorl %eax, %eax
+; X64-HSW-NEXT: .LBB0_37:
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_3:
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_4:
+; X64-HSW-NEXT: shll $2, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_5:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_6:
+; X64-HSW-NEXT: addl %eax, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_7:
+; X64-HSW-NEXT: leal (,%rax,8), %ecx
+; X64-HSW-NEXT: jmp .LBB0_8
+; X64-HSW-NEXT: .LBB0_9:
+; X64-HSW-NEXT: shll $3, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_10:
+; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_11:
+; X64-HSW-NEXT: addl %eax, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_12:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT: leal (%rax,%rcx,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_13:
+; X64-HSW-NEXT: shll $2, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_14:
+; X64-HSW-NEXT: leal (%rax,%rax,2), %ecx
+; X64-HSW-NEXT: leal (%rax,%rcx,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_15:
+; X64-HSW-NEXT: leal (%rax,%rax,2), %ecx
+; X64-HSW-NEXT: jmp .LBB0_16
+; X64-HSW-NEXT: .LBB0_18:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_19:
+; X64-HSW-NEXT: shll $4, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_20:
+; X64-HSW-NEXT: movl %eax, %ecx
+; X64-HSW-NEXT: shll $4, %ecx
+; X64-HSW-NEXT: jmp .LBB0_17
+; X64-HSW-NEXT: .LBB0_21:
+; X64-HSW-NEXT: addl %eax, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_22:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT: shll $2, %ecx
+; X64-HSW-NEXT: jmp .LBB0_8
+; X64-HSW-NEXT: .LBB0_23:
+; X64-HSW-NEXT: shll $2, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_24:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT: leal (%rax,%rcx,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_25:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT: .LBB0_16:
+; X64-HSW-NEXT: leal (%rax,%rcx,4), %ecx
+; X64-HSW-NEXT: jmp .LBB0_17
+; X64-HSW-NEXT: .LBB0_26:
+; X64-HSW-NEXT: leal (%rax,%rax,2), %ecx
+; X64-HSW-NEXT: shll $3, %ecx
+; X64-HSW-NEXT: jmp .LBB0_8
+; X64-HSW-NEXT: .LBB0_27:
+; X64-HSW-NEXT: shll $3, %eax
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_28:
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_29:
+; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT: jmp .LBB0_8
+; X64-HSW-NEXT: .LBB0_30:
+; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
+; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_31:
+; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT: jmp .LBB0_17
+; X64-HSW-NEXT: .LBB0_32:
+; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT: addl %eax, %ecx
+; X64-HSW-NEXT: .LBB0_17:
+; X64-HSW-NEXT: addl %eax, %ecx
+; X64-HSW-NEXT: movl %ecx, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_33:
+; X64-HSW-NEXT: movl %eax, %ecx
+; X64-HSW-NEXT: shll $5, %ecx
+; X64-HSW-NEXT: subl %eax, %ecx
+; X64-HSW-NEXT: jmp .LBB0_8
+; X64-HSW-NEXT: .LBB0_34:
+; X64-HSW-NEXT: movl %eax, %ecx
+; X64-HSW-NEXT: shll $5, %ecx
+; X64-HSW-NEXT: .LBB0_8:
+; X64-HSW-NEXT: subl %eax, %ecx
+; X64-HSW-NEXT: movl %ecx, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+; X64-HSW-NEXT: .LBB0_35:
+; X64-HSW-NEXT: shll $5, %eax
+; X64-HSW-NEXT: # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT: retq
+ %3 = icmp eq i32 %1, 0
+ %4 = icmp sgt i32 %1, 1
+ %5 = or i1 %3, %4
+ %6 = select i1 %5, i32 1, i32 %1
+ switch i32 %0, label %69 [
+ i32 1, label %70
+ i32 2, label %7
+ i32 3, label %9
+ i32 4, label %11
+ i32 5, label %13
+ i32 6, label %15
+ i32 7, label %17
+ i32 8, label %19
+ i32 9, label %21
+ i32 10, label %23
+ i32 11, label %25
+ i32 12, label %27
+ i32 13, label %29
+ i32 14, label %31
+ i32 15, label %33
+ i32 16, label %35
+ i32 17, label %37
+ i32 18, label %39
+ i32 19, label %41
+ i32 20, label %43
+ i32 21, label %45
+ i32 22, label %47
+ i32 23, label %49
+ i32 24, label %51
+ i32 25, label %53
+ i32 26, label %55
+ i32 27, label %57
+ i32 28, label %59
+ i32 29, label %61
+ i32 30, label %63
+ i32 31, label %65
+ i32 32, label %67
+ ]
+
+;