diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -48,6 +48,7 @@
 class GlobalValue;
 class GlobalVariable;
 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineConstantPoolValue;
 class MachineDominatorTree;
 class MachineFunction;
@@ -69,6 +70,7 @@
 class MCTargetOptions;
 class MDNode;
 class Module;
+class ProfileSummaryInfo;
 class raw_ostream;
 class RemarkStreamer;
 class StackMaps;
@@ -108,6 +110,10 @@
   /// Optimization remark emitter.
   MachineOptimizationRemarkEmitter *ORE;

+  MachineBlockFrequencyInfo *MBFI;
+
+  ProfileSummaryInfo *PSI;
+
   /// The symbol for the current function. This is recalculated at the beginning
   /// of each call to runOnMachineFunction().
   MCSymbol *CurrentFnSym = nullptr;
diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -25,11 +25,13 @@
 namespace llvm {

 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineBranchProbabilityInfo;
 class MachineFunction;
 class MachineInstr;
 class MachineModuleInfo;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class TargetRegisterInfo;

 /// Utility class to perform tail duplication.
@@ -40,6 +42,8 @@
   const MachineModuleInfo *MMI;
   MachineRegisterInfo *MRI;
   MachineFunction *MF;
+  const MachineBlockFrequencyInfo *MBFI;
+  ProfileSummaryInfo *PSI;
   bool PreRegAlloc;
   bool LayoutMode;
   unsigned TailDupSize;
@@ -65,6 +69,8 @@
   /// default implies using the command line value TailDupSize.
   void initMF(MachineFunction &MF, bool PreRegAlloc,
               const MachineBranchProbabilityInfo *MBPI,
+              const MachineBlockFrequencyInfo *MBFI,
+              ProfileSummaryInfo *PSI,
               bool LayoutMode, unsigned TailDupSize = 0);

   bool tailDuplicateBlocks();
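Both headers above gain the same pair of members, and every pass touched by this patch combines them into a single query. A minimal sketch of that recurring check, with an illustrative helper name (`optForSize` is not part of the patch):

```cpp
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineSizeOpts.h"

using namespace llvm;

// PGSO in one line: treat a block as if the function were built with -Os
// when the OptSize attribute is set, or when profile data says the block
// is cold.
static bool optForSize(const MachineBasicBlock &MBB, ProfileSummaryInfo *PSI,
                       const MachineBlockFrequencyInfo *MBFI) {
  return MBB.getParent()->getFunction().hasOptSize() ||
         shouldOptimizeForSize(&MBB, PSI, MBFI);
}
```

`shouldOptimizeForSize` comes from MachineSizeOpts.h and returns false when either analysis pointer is null, so the null MBFI that the passes below use in non-PGO builds simply disables the profile path.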
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -31,13 +31,16 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -52,6 +55,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -248,6 +252,8 @@
   AU.addRequired<MachineModuleInfoWrapperPass>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
 }

 bool AsmPrinter::doInitialization(Module &M) {
@@ -1684,6 +1690,10 @@
   }
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;
 }

 namespace {
@@ -2913,8 +2923,10 @@
 void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB,
                                          MCCodePaddingContext &Context) const {
   assert(MF != nullptr && "Machine function must be valid");
+  bool OptForSize = MF->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
   Context.IsPaddingActive = !MF->hasInlineAsm() &&
-                            !MF->getFunction().hasOptSize() &&
+                            !OptForSize &&
                             TM.getOptLevel() != CodeGenOpt::None;
   Context.IsBasicBlockReachableViaFallthrough =
       std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) !=
diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h
--- a/llvm/lib/CodeGen/BranchFolding.h
+++ b/llvm/lib/CodeGen/BranchFolding.h
@@ -27,6 +27,7 @@
 class MachineLoopInfo;
 class MachineModuleInfo;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class raw_ostream;
 class TargetInstrInfo;
 class TargetRegisterInfo;
@@ -39,6 +40,7 @@
                        bool CommonHoist, MBFIWrapper &FreqInfo,
                        const MachineBranchProbabilityInfo &ProbInfo,
+                       ProfileSummaryInfo *PSI,
                        // Min tail length to merge. Defaults to commandline
                        // flag. Ignored for optsize.
                        unsigned MinTailLength = 0);
@@ -145,6 +147,7 @@
                      const BlockFrequency Freq) const;
     void view(const Twine &Name, bool isSimple = true);
     uint64_t getEntryFreq() const;
+    const MachineBlockFrequencyInfo &getMBFI() { return MBFI; }

   private:
     const MachineBlockFrequencyInfo &MBFI;
@@ -154,6 +157,7 @@
 private:
   MBFIWrapper &MBBFreqInfo;
   const MachineBranchProbabilityInfo &MBPI;
+  ProfileSummaryInfo *PSI;

   bool TailMergeBlocks(MachineFunction &MF);
   bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
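BranchFolder can simply take `PSI` through its constructor because its callers already own a block-frequency wrapper (hence the new `getMBFI()` accessor above). Standalone passes instead repeat the acquisition pattern seen in `AsmPrinter::runOnMachineFunction`; a sketch of that wiring in a hypothetical pass (`SizeAwarePass` is illustrative, and pass-registration boilerplate is omitted):

```cpp
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

namespace {
struct SizeAwarePass : public MachineFunctionPass {
  static char ID;
  SizeAwarePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Require PSI plus the *lazy* MBFI wrapper, exactly as the passes in
    // this patch do.
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    // Only force the block-frequency computation when a profile summary
    // exists; non-PGO builds keep a null MBFI and skip the work.
    auto *MBFI =
        PSI->hasProfileSummary()
            ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
            : nullptr;
    (void)MBFI;
    return false;
  }
};
char SizeAwarePass::ID = 0;
} // end anonymous namespace
```

The lazy wrapper is why the pipeline tests further down only grow "Lazy (Machine) Block Frequency Analysis" entries: `getBFI()` computes frequencies on first use, so the analysis costs nothing unless a profile is present.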
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -38,6 +39,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -103,6 +105,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
       AU.addRequired<TargetPassConfig>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -129,7 +132,8 @@
   BranchFolder::MBFIWrapper MBBFreqInfo(
       getAnalysis<MachineBlockFrequencyInfo>());
   BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
-                      getAnalysis<MachineBranchProbabilityInfo>());
+                      getAnalysis<MachineBranchProbabilityInfo>(),
+                      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
   auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
   return Folder.OptimizeFunction(
       MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(),
@@ -139,9 +143,10 @@
 BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
                            MBFIWrapper &FreqInfo,
                            const MachineBranchProbabilityInfo &ProbInfo,
+                           ProfileSummaryInfo *PSI,
                            unsigned MinTailLength)
     : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
-      MBBFreqInfo(FreqInfo), MBPI(ProbInfo) {
+      MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) {
   if (MinCommonTailLength == 0)
     MinCommonTailLength = TailMergeSize;
   switch (FlagEnableTailMerge) {
@@ -585,7 +590,9 @@
                                         MachineBasicBlock::iterator &I2,
                                         MachineBasicBlock *SuccBB,
                                         MachineBasicBlock *PredBB,
                                         DenseMap<const MachineBasicBlock *, int> &EHScopeMembership,
-                                        bool AfterPlacement) {
+                                        bool AfterPlacement,
+                                        BranchFolder::MBFIWrapper &MBBFreqInfo,
+                                        ProfileSummaryInfo *PSI) {
   // It is never profitable to tail-merge blocks from two different EH scopes.
   if (!EHScopeMembership.empty()) {
     auto EHScope1 = EHScopeMembership.find(MBB1);
@@ -682,7 +689,11 @@
   // branch instruction, which is likely to be smaller than the 2
   // instructions that would be deleted in the merge.
   MachineFunction *MF = MBB1->getParent();
-  return EffectiveTailLen >= 2 && MF->getFunction().hasOptSize() &&
+  bool OptForSize =
+      MF->getFunction().hasOptSize() ||
+      (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) &&
+       llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI()));
+  return EffectiveTailLen >= 2 && OptForSize &&
          (FullBlockTail1 || FullBlockTail2);
 }

@@ -704,7 +715,7 @@
                                  CommonTailLen, TrialBBI1, TrialBBI2,
                                  SuccBB, PredBB,
                                  EHScopeMembership,
-                                 AfterBlockPlacement)) {
+                                 AfterBlockPlacement, MBBFreqInfo, PSI)) {
       if (CommonTailLen > maxCommonTailLength) {
         SameTails.clear();
         maxCommonTailLength = CommonTailLen;
@@ -1534,8 +1545,10 @@
     }
   }

-  if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
-      MF.getFunction().hasOptSize()) {
+  bool OptForSize =
+      MF.getFunction().hasOptSize() ||
+      llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI());
+  if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) {
     // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
     // direction, thereby defeating careful block placement and regressing
     // performance. Therefore, only consider this for optsize functions.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -90,6 +90,7 @@
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -256,6 +257,7 @@
     const LoopInfo *LI;
     std::unique_ptr<BlockFrequencyInfo> BFI;
     std::unique_ptr<BranchProbabilityInfo> BPI;
+    ProfileSummaryInfo *PSI;

     /// As we scan instructions optimizing them, this is the next instruction
     /// to optimize. Transforms that can invalidate this should update it.
@@ -298,7 +300,7 @@
     /// Keep track of SExt promoted.
     ValueToSExts ValToSExtendedUses;

-    /// True if optimizing for size.
+    /// True if the function has the OptSize attribute.
     bool OptSize;

     /// DataLayout for the Function being processed.
@@ -434,10 +436,8 @@
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   BPI.reset(new BranchProbabilityInfo(F, *LI));
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   OptSize = F.hasOptSize();
-
-  ProfileSummaryInfo *PSI =
-      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   if (ProfileGuidedSectionPrefix) {
     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
       F.setSectionPrefix(".hot");
@@ -456,7 +456,9 @@
       // bypassSlowDivision may create new BBs, but we don't want to reapply the
       // optimization to those blocks.
       BasicBlock* Next = BB->getNextNode();
-      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+      // F.hasOptSize is already checked in the outer if statement.
+      if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+        EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
       BB = Next;
     }
   }
@@ -1937,7 +1939,8 @@
     // cold block. This interacts with our handling for loads and stores to
     // ensure that we can fold all uses of a potential addressing computation
     // into their uses.  TODO: generalize this to work over profiling data
-    if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+    bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get());
+    if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
       for (auto &Arg : CI->arg_operands()) {
         if (!Arg->getType()->isPointerTy())
           continue;
@@ -2872,16 +2875,24 @@
   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
   bool IgnoreProfitability;

+  /// True if we are optimizing for size.
+  bool OptSize;
+
+  ProfileSummaryInfo *PSI;
+  BlockFrequencyInfo *BFI;
+
   AddressingModeMatcher(
       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
       const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
-      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP)
+      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+      bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
-        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) {
+        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
+        OptSize(OptSize), PSI(PSI), BFI(BFI) {
     IgnoreProfitability = false;
   }
@@ -2899,12 +2910,14 @@
           const TargetLowering &TLI, const TargetRegisterInfo &TRI,
           const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
           TypePromotionTransaction &TPT,
-          std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) {
+          std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+          bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
     ExtAddrMode Result;

     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy,
                                          AS, MemoryInst, Result, InsertedInsts,
-                                         PromotedInsts, TPT, LargeOffsetGEP)
+                                         PromotedInsts, TPT, LargeOffsetGEP,
+                                         OptSize, PSI, BFI)
                        .matchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
     return Result;
@@ -4515,7 +4528,8 @@
     Instruction *I,
     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
-    const TargetRegisterInfo &TRI, int SeenInsts = 0) {
+    const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI, int SeenInsts = 0) {
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I).second)
     return false;
@@ -4524,8 +4538,6 @@
   if (!MightBeFoldableInst(I))
     return true;

-  const bool OptSize = I->getFunction()->hasOptSize();
-
   // Loop over all the uses, recursively processing them.
   for (Use &U : I->uses()) {
     // Conservatively return true if we're seeing a large number or a deep chain
@@ -4566,7 +4578,9 @@
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
       // If this is a cold call, we can sink the addressing calculation into
       // the cold path.  See optimizeCallInst
-      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+      bool OptForSize = OptSize ||
+          llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+      if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
         continue;

       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
@@ -4578,8 +4592,8 @@
       continue;
     }

-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI,
-                          SeenInsts))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+                          PSI, BFI, SeenInsts))
       return true;
   }

@@ -4667,7 +4681,8 @@
   // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+                        PSI, BFI))
     return false;  // Has a non-memory, non-foldable use!

   // Now that we know that all uses of this instruction are part of a chain of
@@ -4703,7 +4718,7 @@
         TPT.getRestorationPoint();
     AddressingModeMatcher Matcher(
         MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
     Matcher.IgnoreProfitability = true;
     bool Success = Matcher.matchAddr(Address, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -4809,7 +4824,8 @@
                                                                       0);
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
+        BFI.get());

     GetElementPtrInst *GEP = LargeOffsetGEP.first;
     if (GEP && !NewGEPBases.count(GEP)) {
@@ -6027,7 +6043,9 @@
 /// turn it into a branch.
 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
   // If branch conversion isn't desirable, exit early.
-  if (DisableSelectToBranch || OptSize || !TLI)
+  if (DisableSelectToBranch ||
+      OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) ||
+      !TLI)
     return false;

   // Find all consecutive select instructions that share the same condition.
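CodeGenPrepare runs on IR, so it uses the IR-level twin of the machine query, declared in llvm/Transforms/Utils/SizeOpts.h, together with the BFI the pass already builds per function. The select-to-branch gate above, for example, reduces to a check like this sketch (the helper name is illustrative):

```cpp
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

using namespace llvm;

// Keep a select as a select (no branch) when the function or the block is
// size-optimized: the extra branch would cost code size.
static bool keepSelect(SelectInst &SI, bool FnHasOptSize,
                       ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  return FnHasOptSize || shouldOptimizeForSize(SI.getParent(), PSI, BFI);
}
```

Note the renamed doc comment on `OptSize` above: the member now deliberately tracks only the attribute, and each use site decides whether to widen it with block-level profile information.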
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -13,6 +13,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -21,6 +23,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"

 using namespace llvm;

@@ -721,7 +724,8 @@
 ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
 ///  ret i32 %phi.res
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
+                         const TargetLowering *TLI, const DataLayout *DL,
+                         ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
   NumMemCmpCalls++;

   // Early exit from expansion if -Oz.
@@ -742,18 +746,20 @@
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
+  bool OptForSize = CI->getFunction()->hasOptSize() ||
+                    llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+  auto Options = TTI->enableMemCmpExpansion(OptForSize,
                                             IsUsedForZeroCmp);
   if (!Options)
     return false;

   if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
     Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;

-  if (CI->getFunction()->hasOptSize() &&
+  if (OptForSize &&
       MaxLoadsPerMemcmpOptSize.getNumOccurrences())
     Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;

-  if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
+  if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
     Options.MaxNumLoads = MaxLoadsPerMemcmp;

   MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL);
@@ -799,7 +805,11 @@
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
     const TargetTransformInfo *TTI =
         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto PA = runImpl(F, TLI, TTI, TL);
+    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+    auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+        &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+        nullptr;
+    auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI);
     return !PA.areAllPreserved();
   }

@@ -807,22 +817,26 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
     FunctionPass::getAnalysisUsage(AU);
   }

   PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                             const TargetTransformInfo *TTI,
-                            const TargetLowering* TL);
+                            const TargetLowering* TL,
+                            ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI);

   // Returns true if a change was made.
   bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
                   const TargetTransformInfo *TTI, const TargetLowering* TL,
-                  const DataLayout& DL);
+                  const DataLayout& DL, ProfileSummaryInfo *PSI,
+                  BlockFrequencyInfo *BFI);
 };

 bool ExpandMemCmpPass::runOnBlock(
     BasicBlock &BB, const TargetLibraryInfo *TLI,
     const TargetTransformInfo *TTI, const TargetLowering* TL,
-    const DataLayout& DL) {
+    const DataLayout& DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
   for (Instruction& I : BB) {
     CallInst *CI = dyn_cast<CallInst>(&I);
     if (!CI) {
@@ -831,7 +845,7 @@
     LibFunc Func;
     if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL)) {
+        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) {
       return true;
     }
   }
@@ -841,11 +855,12 @@

 PreservedAnalyses ExpandMemCmpPass::runImpl(
     Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
-    const TargetLowering* TL) {
+    const TargetLowering* TL, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI) {
   const DataLayout& DL = F.getParent()->getDataLayout();
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
-    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI)) {
       MadeChanges = true;
       // If changes were made, restart the function from the beginning, since
       // the structure of the function was changed.
@@ -864,6 +879,8 @@
                       "Expand memcmp() to load/stores", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
                     "Expand memcmp() to load/stores", false, false)
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -213,6 +214,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -434,6 +436,7 @@
 INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false)

 bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
@@ -446,6 +449,8 @@
   TRI = ST.getRegisterInfo();
   BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+  ProfileSummaryInfo *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
@@ -456,7 +461,7 @@
   bool BFChange = false;
   if (!PreRegAlloc) {
     // Tail merge tends to expose more if-conversion opportunities.
-    BranchFolder BF(true, false, MBFI, *MBPI);
+    BranchFolder BF(true, false, MBFI, *MBPI, PSI);
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
     BFChange = BF.OptimizeFunction(
         MF, TII, ST.getRegisterInfo(),
@@ -598,7 +603,7 @@
   BBAnalysis.clear();

   if (MadeChange && IfCvtBranchFold) {
-    BranchFolder BF(false, false, MBFI, *MBPI);
+    BranchFolder BF(false, false, MBFI, *MBPI, PSI);
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
     BF.OptimizeFunction(
         MF, TII, MF.getSubtarget().getRegisterInfo(),
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -41,6 +42,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -363,6 +365,8 @@
   /// A handle to the post dominator tree.
   MachinePostDominatorTree *MPDT;

+  ProfileSummaryInfo *PSI;
+
   /// Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -538,6 +542,7 @@
     if (TailDupPlacement)
       AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -555,6 +560,7 @@
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
                     "Branch Probability Basic Block Placement", false, false)
@@ -2026,7 +2032,10 @@
   // i.e. when the layout predecessor does not fallthrough to the loop header.
   // In practice this never happens though: there always seems to be a preheader
   // that can fallthrough and that is also placed before the header.
-  if (F->getFunction().hasOptSize())
+  bool OptForSize = F->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(L.getHeader(), PSI,
+                                                &MBFI->getMBFI());
+  if (OptForSize)
     return L.getHeader();

   MachineBasicBlock *OldTop = nullptr;
@@ -2782,6 +2791,11 @@
     if (Freq < (LoopHeaderFreq * ColdProb))
       continue;

+    // If the global profile indicates the block is cold, don't align it.
+    if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) &&
+        !TLI->alignLoopsWithOptSize())
+      continue;
+
     // Check for the existence of a non-layout predecessor which would benefit
     // from aligning this block.
     MachineBasicBlock *LayoutPred =
@@ -2989,6 +3003,7 @@
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MPDT = nullptr;
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
@@ -3019,10 +3034,13 @@
   if (allowTailDupPlacement()) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+    if (OptForSize)
       TailDupSize = 1;
     bool PreRegAlloc = false;
-    TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
+    TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI,
+                   /* LayoutMode */ true, TailDupSize);
     precomputeTriangleChains();
   }

@@ -3038,7 +3056,7 @@
   if (MF.size() > 3 && EnableTailMerge) {
     unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
-                    *MBPI, TailMergeSize);
+                    *MBPI, PSI, TailMergeSize);
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();

     if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -12,11 +12,14 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -67,6 +70,8 @@
   MachineLoopInfo *MLI; // Current MachineLoopInfo
   MachineTraceMetrics *Traces;
   MachineTraceMetrics::Ensemble *MinInstr;
+  MachineBlockFrequencyInfo *MBFI;
+  ProfileSummaryInfo *PSI;

   TargetSchedModel TSchedModel;

@@ -83,7 +88,7 @@
   StringRef getPassName() const override { return "Machine InstCombiner"; }

 private:
-  bool doSubstitute(unsigned NewSize, unsigned OldSize);
+  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
   bool combineInstructions(MachineBasicBlock *);
   MachineInstr *getOperandDef(const MachineOperand &MO);
   unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -132,6 +137,8 @@
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<MachineTraceMetrics>();
   AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }

@@ -409,8 +416,9 @@
 /// \returns true when new instruction sequence should be generated
 /// independent if it lengthens critical path or not
-bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
-  if (OptSize && (NewSize < OldSize))
+bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
+                                   bool OptForSize) {
+  if (OptForSize && (NewSize < OldSize))
     return true;
   if (!TSchedModel.hasInstrSchedModelOrItineraries())
     return true;
@@ -508,6 +516,8 @@
   SparseSet<LiveRegUnit> RegUnits;
   RegUnits.setUniverse(TRI->getNumRegUnits());

+  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
     SmallVector<MachineCombinerPattern, 16> Patterns;
@@ -584,7 +594,8 @@
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
-      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+      if (SubstituteAlways ||
+          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
         insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                  RegUnits, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
@@ -639,6 +650,10 @@
   MRI = &MF.getRegInfo();
   MLI = &getAnalysis<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;
   MinInstr = nullptr;
   OptSize = MF.getFunction().hasOptSize();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,8 +27,10 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/FastISel.h"
@@ -334,6 +336,8 @@
   AU.addRequired<TargetTransformInfoWrapperPass>();
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     AU.addRequired<BranchProbabilityInfoWrapperPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
+  LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
   MachineFunctionPass::getAnalysisUsage(AU);
 }

@@ -436,14 +440,17 @@
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
   auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
   LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+      nullptr;

   LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");

   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);

   CurDAG->init(*MF, *ORE, this, LibInfo,
-               getAnalysisIfAvailable<LegacyDivergenceAnalysis>(),
-               nullptr, nullptr);
+               getAnalysisIfAvailable<LegacyDivergenceAnalysis>(), PSI, BFI);
   FuncInfo->set(Fn, *MF, CurDAG);
   SwiftError->setFunction(*MF);
diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp
--- a/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/llvm/lib/CodeGen/TailDuplication.cpp
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//

+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -38,6 +40,8 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -75,7 +79,11 @@
     return false;

   auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
-  Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false);
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;
+  Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false);

   bool MadeChange = false;
   while (Duplicator.tailDuplicateBlocks())
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -19,13 +19,16 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -77,6 +80,8 @@
 void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
                             const MachineBranchProbabilityInfo *MBPIin,
+                            const MachineBlockFrequencyInfo *MBFIin,
+                            ProfileSummaryInfo *PSIin,
                             bool LayoutModeIn, unsigned TailDupSizeIn) {
   MF = &MFin;
   TII = MF->getSubtarget().getInstrInfo();
@@ -84,6 +89,8 @@
   MRI = &MF->getRegInfo();
   MMI = &MF->getMMI();
   MBPI = MBPIin;
+  MBFI = MBFIin;
+  PSI = PSIin;
   TailDupSize = TailDupSizeIn;

   assert(MBPI != nullptr && "Machine Branch Probability Info required");
@@ -555,14 +562,14 @@
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
-  if (TailDupSize == 0 &&
-      TailDuplicateSize.getNumOccurrences() == 0 &&
-      MF->getFunction().hasOptSize())
-    MaxDuplicateCount = 1;
-  else if (TailDupSize == 0)
+  bool OptForSize = MF->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI);
+  if (TailDupSize == 0)
     MaxDuplicateCount = TailDuplicateSize;
   else
     MaxDuplicateCount = TailDupSize;
+  if (OptForSize)
+    MaxDuplicateCount = 1;

   // If the block to be duplicated ends in an unanalyzable fallthrough, don't
   // duplicate it.
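The rewritten `MaxDuplicateCount` selection is subtly stronger than the code it replaces: previously the cap of 1 applied only when `TailDupSize` was 0, the `TailDuplicateSize` option was left at its default, and the function had OptSize; now `OptForSize` is applied last, so a size-optimized (or profile-cold) tail block is capped at 1 even when a pass or the command line asked for a larger limit. Restated as a sketch:

```cpp
// Equivalent form of the new limit computation: size optimization wins
// last, regardless of how the baseline limit was chosen.
static unsigned maxDuplicateCount(unsigned TailDupSize,
                                  unsigned TailDuplicateSizeOpt, // cl::opt
                                  bool OptForSize) {
  unsigned Max = (TailDupSize == 0) ? TailDuplicateSizeOpt : TailDupSize;
  return OptForSize ? 1 : Max;
}
```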
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -48,11 +48,14 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
@@ -113,6 +116,8 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
                                        // guide some heuristics.
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -140,6 +145,9 @@
   /// Register Liveness information after the current instruction.
   LivePhysRegs LiveRegs;
+
+  ProfileSummaryInfo *PSI;
+  MachineBlockFrequencyInfo *MBFI;
 };
 char FixupBWInstPass::ID = 0;
 }
@@ -154,8 +162,11 @@
   this->MF = &MF;
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
-  OptForSize = MF.getFunction().hasOptSize();
   MLI = &getAnalysis<MachineLoopInfo>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;
   LiveRegs.init(TII->getRegisterInfo());

   LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
@@ -426,6 +437,9 @@
   // We run after PEI, so we need to AddPristinesAndCSRs.
   LiveRegs.addLiveOuts(MBB);

+  OptForSize = MF.getFunction().hasOptSize() ||
+               llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+
   for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
     MachineInstr *MI = &*I;
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -25,6 +25,8 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,6 +34,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -247,6 +250,12 @@

   static char ID;

+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -681,6 +690,11 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;

   // Process all basic blocks.
   for (auto &MBB : MF) {
@@ -699,7 +713,9 @@
     // Remove redundant address calculations. Do it only for -Os/-Oz since only
     // a code size gain is expected from this part of the pass.
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+    if (OptForSize)
       Changed |= removeRedundantAddrCalc(LEAs);
   }
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -17,8 +17,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Function.h"
@@ -52,6 +55,12 @@

     bool runOnMachineFunction(MachineFunction &MF) override;

+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
     MachineFunctionProperties getRequiredProperties() const override {
       return MachineFunctionProperties().set(
           MachineFunctionProperties::Property::NoVRegs);
@@ -105,6 +114,12 @@

   TSM.init(&MF.getSubtarget());

+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+      &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+      nullptr;
+
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
   VisitedBBs.clear();
@@ -118,6 +133,11 @@
     MachineBasicBlock *MBB = I->first;
     unsigned Cycles = I->second;

+    // Function::hasOptSize is already checked above.
+    bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize)
+      continue;
+
     if (Cycles < Threshold) {
       // BB ends in a return. Skip over any DBG_VALUE instructions
       // trailing the terminator.
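The tests that follow all trigger PGSO the same way: each function carries `!prof !14`, a `function_entry_count` of 0, and the module carries an InstrProf `ProfileSummary`, so every annotated function is cold relative to the summary and the new code paths fire without any OptSize attribute. At this point in the series the query reduces to a coldness check roughly like this sketch (an assumption about the internals of SizeOpts, not part of the patch):

```cpp
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// With the tests' ProfileSummary metadata, an entry count of 0 falls below
// the cold-count cutoff, so the function (and every block in it) counts as
// cold and PGSO behaves like -Os.
static bool pgsoFires(const Function &F, ProfileSummaryInfo *PSI,
                      BlockFrequencyInfo *BFI) {
  return PSI && BFI && PSI->hasProfileSummary() &&
         PSI->isFunctionColdInCallGraph(&F, *BFI);
}
```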
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -45,6 +46,10 @@
 ; CHECK-NEXT: Analysis for ComputingKnownBits
 ; CHECK-NEXT: InstructionSelect
 ; CHECK-NEXT: ResetMachineFunction
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: AArch64 Instruction Selection
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT: Local Stack Slot Allocation
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Profile summary info
+; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Pre-ISel Intrinsic Lowering
@@ -35,6 +35,9 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -78,10 +81,13 @@
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: AArch64 Instruction Selection
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
@@ -93,6 +99,7 @@
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Trace Metrics
 ; CHECK-NEXT: AArch64 Conditional Compares
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine InstCombiner
 ; CHECK-NEXT: AArch64 Conditional Branch Tuning
 ; CHECK-NEXT: Machine Trace Metrics
@@ -149,6 +156,7 @@
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
@@ -0,0 +1,128 @@
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-apple-darwin -o - | \
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-DARWIN %s
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-linux-gnu -o - | \
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()

+; CHECK-LABEL: fct1:
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct1(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+; CHECK-LABEL: fct2:
+; When the size is bigger than 256, change into bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct2(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct3:
+; For unknown size, change to bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct3(i8* nocapture %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct4:
+; Size <= 256, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct4(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK-LABEL: fct5:
+; Size > 256, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct5(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct6:
+; Size = unknown, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct6(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+  ret void
+}
+
+; Next functions check that memset is not turned into bzero
+; when the set constant is non-zero, whatever the given size.
+
+; CHECK-LABEL: fct7:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct7(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct8:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct8(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct9:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct9(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/AArch64/max-jump-table.ll b/llvm/test/CodeGen/AArch64/max-jump-table.ll
--- a/llvm/test/CodeGen/AArch64/max-jump-table.ll
+++ b/llvm/test/CodeGen/AArch64/max-jump-table.ll
@@ -203,3 +203,136 @@
 return: ret void
 }
+
+define i32 @jt1_optsize(i32 %a, i32 %b) optsize {
+entry:
+  switch i32 %a, label %return [
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+    i32 5, label %bb5
+    i32 6, label %bb6
+    i32 7, label %bb7
+    i32 8, label %bb8
+    i32 9, label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_optsize:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT: %jump-table.1:
+; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT: %jump-table.1:
+; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT: %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT: %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT: %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT: %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_optsize.
+
+bb1: tail call void @ext(i32 1, i32 0) br label %return
+bb2: tail call void @ext(i32 2, i32 2) br label %return
+bb3: tail call void @ext(i32 3, i32 4) br label %return
+bb4: tail call void @ext(i32 4, i32 6) br label %return
+bb5: tail call void @ext(i32 5, i32 8) br label %return
+bb6: tail call void @ext(i32 6, i32 10) br label %return
+bb7: tail call void @ext(i32 7, i32 12) br label %return
+bb8: tail call void @ext(i32 8, i32 14) br label %return
+bb9: tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+define i32 @jt1_pgso(i32 %a, i32 %b) !prof !14 {
+entry:
+  switch i32 %a, label %return [
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+    i32 4, label %bb4
+    i32 5, label %bb5
+    i32 6, label %bb6
+    i32 7, label %bb7
+    i32 8, label %bb8
+    i32 9, label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_pgso:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT: %jump-table.1:
+; CHECK4-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT: %jump-table.1:
+; CHECK8-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT: %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT: %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT: %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT: %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_pgso.
+
+bb1: tail call void @ext(i32 1, i32 0) br label %return
+bb2: tail call void @ext(i32 2, i32 2) br label %return
+bb3: tail call void @ext(i32 3, i32 4) br label %return
+bb4: tail call void @ext(i32 4, i32 6) br label %return
+bb5: tail call void @ext(i32 5, i32 8) br label %return
+bb6: tail call void @ext(i32 6, i32 10) br label %return
+bb7: tail call void @ext(i32 7, i32 12) br label %return
+bb8: tail call void @ext(i32 8, i32 14) br label %return
+bb9: tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -19,6 +19,9 @@
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Merge contiguous icmps into a memcmp
+; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Expand memcmp() to load/stores
 ; CHECK-NEXT: Lower Garbage Collection Instructions
 ; CHECK-NEXT: Shadow Stack GC Lowering
@@ -67,8 +70,11 @@
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: ARM Instruction Selection
 ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Early Tail Duplication
 ; CHECK-NEXT: Optimize machine instruction PHIs
 ; CHECK-NEXT: Slot index numbering
@@ -124,6 +130,7 @@
 ; CHECK-NEXT: Shrink Wrapping analysis
 ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT: Control Flow Optimizer
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Tail Duplication
 ; CHECK-NEXT: Machine Copy Propagation Pass
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/ARM/constantpool-align.ll b/llvm/test/CodeGen/ARM/constantpool-align.ll
--- a/llvm/test/CodeGen/ARM/constantpool-align.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-align.ll
@@ -17,3 +17,28 @@
   store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
   ret void
 }
+
+; CHECK-LABEL: f_pgso:
+; CHECK: vld1.64 {{.*}}, [r1]
+; CHECK: .p2align 3
+define void @f_pgso(<4 x i32>* %p) !prof !14 {
+  store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
+  ret void
+}
+ +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll --- a/llvm/test/CodeGen/RISCV/tail-calls.ll +++ b/llvm/test/CodeGen/RISCV/tail-calls.ll @@ -23,6 +23,17 @@ ret void } +; Perform tail call optimization for external symbol. +@dest_pgso = global [2 x i8] zeroinitializer +define void @caller_extern_pgso(i8* %src) !prof !14 { +entry: +; CHECK: caller_extern_pgso +; CHECK-NOT: call memcpy +; CHECK: tail memcpy + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @dest_pgso, i32 0, i32 0), i8* %src, i32 7, i1 false) + ret void +} + ; Perform indirect tail call optimization (for function pointer call). declare void @callee_indirect1() declare void @callee_indirect2() @@ -146,3 +157,20 @@ tail call void @callee_nostruct() ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -14,6 +14,7 @@ ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Create Garbage Collector Module Metadata +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -37,6 +38,10 @@ ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll --- a/llvm/test/CodeGen/X86/O3-pipeline.ll +++ b/llvm/test/CodeGen/X86/O3-pipeline.ll @@ -13,8 +13,8 @@ ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering @@ -32,6 +32,9 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; 
CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Merge contiguous icmps into a memcmp +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Expand memcmp() to load/stores ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering @@ -64,12 +67,15 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Branch Probability Analysis +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: X86 DAG->DAG Instruction Selection ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Local Dynamic TLS Access Clean-up ; CHECK-NEXT: X86 PIC Global Base Reg Initialization ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: X86 Domain Reassignment Pass +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Early Tail Duplication ; CHECK-NEXT: Optimize machine instruction PHIs ; CHECK-NEXT: Slot index numbering @@ -80,6 +86,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Machine Trace Metrics ; CHECK-NEXT: Early If-Conversion +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine InstCombiner ; CHECK-NEXT: X86 cmov Conversion ; CHECK-NEXT: MachineDominator Tree Construction @@ -94,6 +101,7 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: Live Range Shrink ; CHECK-NEXT: X86 Fixup SetCC +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame ; CHECK-NEXT: X86 Avoid Store Forwarding Block @@ -139,6 +147,7 @@ ; CHECK-NEXT: Shrink Wrapping analysis ; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization ; CHECK-NEXT: Control Flow Optimizer +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Tail Duplication ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: Post-RA pseudo instruction expansion pass @@ -157,7 +166,9 @@ ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Byte/Word Instruction Fixup +; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible diff --git a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll --- a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll +++ b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll @@ -29,6 +29,13 @@ ret i32 %a } +define i32 @test_pgso(i32 %a) nounwind !prof !14 { +; CHECK: test_pgso +; CHECK: movl +; CHECK-NEXT: ret + ret i32 %a +} + define i32 @test_add(i32 %a, i32 %b) nounwind { ; CHECK: test_add ; CHECK: addl @@ -101,3 +108,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git 
a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -190,6 +190,16 @@ ret float %res } +define float @floor_f32_load_pgso(float* %aptr) !prof !14 { +; CHECK-LABEL: floor_f32_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load float, float* %aptr + %res = call float @llvm.floor.f32(float %a) + ret float %res +} + define double @nearbyint_f64_load(double* %aptr) optsize { ; CHECK-LABEL: nearbyint_f64_load: ; CHECK: # %bb.0: @@ -200,3 +210,29 @@ ret double %res } +define double @nearbyint_f64_load_pgso(double* %aptr) !prof !14 { +; CHECK-LABEL: nearbyint_f64_load_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = load double, double* %aptr + %res = call double @llvm.nearbyint.f64(double %a) + ret double %res +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1970,6 +1970,47 @@ ret <32 x i16> %ret } +define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 { +; KNL-LABEL: test_build_vec_v32i1_pgso: +; KNL: ## %bb.0: +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_build_vec_v32i1_pgso: +; SKX: ## %bb.0: +; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_build_vec_v32i1_pgso: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_build_vec_v32i1_pgso: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_build_vec_v32i1_pgso: +; X86: ## %bb.0: +; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495 +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl + %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %ret +} + define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## %bb.0: @@ -2013,12 +2054,12 @@ ; 
KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB43_2 +; KNL-NEXT: je LBB44_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB43_2: ## %L2 +; KNL-NEXT: LBB44_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -2029,12 +2070,12 @@ ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; SKX-NEXT: ktestb %k0, %k1 -; SKX-NEXT: je LBB43_2 +; SKX-NEXT: je LBB44_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB43_2: ## %L2 +; SKX-NEXT: LBB44_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -2046,12 +2087,12 @@ ; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB43_2 +; AVX512BW-NEXT: je LBB44_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovapd %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB43_2: ## %L2 +; AVX512BW-NEXT: LBB44_2: ## %L2 ; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2062,12 +2103,12 @@ ; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: ktestb %k0, %k1 -; AVX512DQ-NEXT: je LBB43_2 +; AVX512DQ-NEXT: je LBB44_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB43_2: ## %L2 +; AVX512DQ-NEXT: LBB44_2: ## %L2 ; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2079,12 +2120,12 @@ ; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z} ; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 ; X86-NEXT: ktestb %k0, %k1 -; X86-NEXT: je LBB43_2 +; X86-NEXT: je LBB44_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovapd %zmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB43_2: ## %L2 +; X86-NEXT: LBB44_2: ## %L2 ; X86-NEXT: vmovapd %zmm0, 8(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -2131,13 +2172,13 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB44_2 +; KNL-NEXT: je LBB45_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: vmovaps %zmm1, 64(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB44_2: ## %L2 +; KNL-NEXT: LBB45_2: ## %L2 ; KNL-NEXT: vmovaps %zmm0, 4(%rdi) ; KNL-NEXT: vmovaps %zmm1, 68(%rdi) ; KNL-NEXT: vzeroupper @@ -2154,13 +2195,13 @@ ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 ; SKX-NEXT: kortestd %k1, %k0 -; SKX-NEXT: je LBB44_2 +; SKX-NEXT: je LBB45_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB44_2: ## %L2 +; SKX-NEXT: LBB45_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper @@ -2177,13 +2218,13 @@ ; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 ; AVX512BW-NEXT: kortestd %k1, %k0 -; AVX512BW-NEXT: je LBB44_2 +; AVX512BW-NEXT: je LBB45_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB44_2: ## %L2 +; AVX512BW-NEXT: LBB45_2: ## %L2 ; AVX512BW-NEXT: 
vmovaps %zmm0, 4(%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512BW-NEXT: vzeroupper @@ -2203,13 +2244,13 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB44_2 +; AVX512DQ-NEXT: je LBB45_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB44_2: ## %L2 +; AVX512DQ-NEXT: LBB45_2: ## %L2 ; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512DQ-NEXT: vzeroupper @@ -2227,13 +2268,13 @@ ; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; X86-NEXT: kunpckwd %k1, %k2, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB44_2 +; X86-NEXT: je LBB45_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovaps %zmm0, (%eax) ; X86-NEXT: vmovaps %zmm1, 64(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB44_2: ## %L2 +; X86-NEXT: LBB45_2: ## %L2 ; X86-NEXT: vmovaps %zmm0, 4(%eax) ; X86-NEXT: vmovaps %zmm1, 68(%eax) ; X86-NEXT: vzeroupper @@ -4188,12 +4229,12 @@ ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testw %ax, %ax -; KNL-NEXT: jle LBB65_1 +; KNL-NEXT: jle LBB66_1 ; KNL-NEXT: ## %bb.2: ## %bb.2 ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB65_1: ## %bb.1 +; KNL-NEXT: LBB66_1: ## %bb.1 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4207,12 +4248,12 @@ ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax -; SKX-NEXT: jle LBB65_1 +; SKX-NEXT: jle LBB66_1 ; SKX-NEXT: ## %bb.2: ## %bb.2 ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB65_1: ## %bb.1 +; SKX-NEXT: LBB66_1: ## %bb.1 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4226,12 +4267,12 @@ ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax -; AVX512BW-NEXT: jle LBB65_1 +; AVX512BW-NEXT: jle LBB66_1 ; AVX512BW-NEXT: ## %bb.2: ## %bb.2 ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB65_1: ## %bb.1 +; AVX512BW-NEXT: LBB66_1: ## %bb.1 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4245,12 +4286,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax -; AVX512DQ-NEXT: jle LBB65_1 +; AVX512DQ-NEXT: jle LBB66_1 ; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB65_1: ## %bb.1 +; AVX512DQ-NEXT: LBB66_1: ## %bb.1 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4264,12 +4305,12 @@ ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: jle LBB65_1 +; X86-NEXT: jle LBB66_1 ; X86-NEXT: ## %bb.2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB65_1: ## %bb.1 +; X86-NEXT: LBB66_1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4297,11 +4338,11 @@ ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 -; CHECK-NEXT: jb LBB66_2 +; CHECK-NEXT: jb LBB67_2 ; CHECK-NEXT: ## %bb.1: ## %bb.1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _foo -; CHECK-NEXT: LBB66_2: ## %bb.2 +; CHECK-NEXT: LBB67_2: ## %bb.2 ; CHECK-NEXT: popq %rax ; 
CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -4313,11 +4354,11 @@ ; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 -; X86-NEXT: jb LBB66_2 +; X86-NEXT: jb LBB67_2 ; X86-NEXT: ## %bb.1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo -; X86-NEXT: LBB66_2: ## %bb.2 +; X86-NEXT: LBB67_2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -4505,12 +4546,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB72_1 +; KNL-NEXT: je LBB73_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB72_1: ## %bar +; KNL-NEXT: LBB73_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4527,12 +4568,12 @@ ; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB72_1 +; SKX-NEXT: je LBB73_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB72_1: ## %bar +; SKX-NEXT: LBB73_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4555,12 +4596,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB72_1 +; AVX512BW-NEXT: je LBB73_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB72_1: ## %bar +; AVX512BW-NEXT: LBB73_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4581,12 +4622,12 @@ ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: korb %k3, %k2, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB72_1 +; AVX512DQ-NEXT: je LBB73_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB72_1: ## %bar +; AVX512DQ-NEXT: LBB73_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4603,12 +4644,12 @@ ; X86-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB72_1 +; X86-NEXT: je LBB73_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB72_1: ## %bar +; X86-NEXT: LBB73_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4646,12 +4687,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB73_1 +; KNL-NEXT: je LBB74_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB73_1: ## %bar +; KNL-NEXT: LBB74_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4668,12 +4709,12 @@ ; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB73_1 +; SKX-NEXT: je LBB74_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB73_1: ## %bar +; SKX-NEXT: LBB74_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4692,12 +4733,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB73_1 +; AVX512BW-NEXT: je LBB74_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; 
AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB73_1: ## %bar +; AVX512BW-NEXT: LBB74_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4714,12 +4755,12 @@ ; AVX512DQ-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korb %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB73_1 +; AVX512DQ-NEXT: je LBB74_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB73_1: ## %bar +; AVX512DQ-NEXT: LBB74_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4736,12 +4777,12 @@ ; X86-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB73_1 +; X86-NEXT: je LBB74_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB73_1: ## %bar +; X86-NEXT: LBB74_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4778,12 +4819,12 @@ ; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: je LBB74_1 +; KNL-NEXT: je LBB75_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB74_1: ## %bar +; KNL-NEXT: LBB75_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4800,12 +4841,12 @@ ; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; SKX-NEXT: korw %k2, %k1, %k1 ; SKX-NEXT: ktestw %k1, %k0 -; SKX-NEXT: je LBB74_1 +; SKX-NEXT: je LBB75_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB74_1: ## %bar +; SKX-NEXT: LBB75_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4823,12 +4864,12 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: je LBB74_1 +; AVX512BW-NEXT: je LBB75_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB74_1: ## %bar +; AVX512BW-NEXT: LBB75_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4845,12 +4886,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestw %k1, %k0 -; AVX512DQ-NEXT: je LBB74_1 +; AVX512DQ-NEXT: je LBB75_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB74_1: ## %bar +; AVX512DQ-NEXT: LBB75_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4867,12 +4908,12 @@ ; X86-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; X86-NEXT: korw %k2, %k1, %k1 ; X86-NEXT: ktestw %k1, %k0 -; X86-NEXT: je LBB74_1 +; X86-NEXT: je LBB75_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB74_1: ## %bar +; X86-NEXT: LBB75_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4928,12 +4969,12 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB75_1 +; KNL-NEXT: je LBB76_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB75_1: ## %bar +; KNL-NEXT: LBB76_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4950,12 +4991,12 @@ ; SKX-NEXT: 
vptestnmw %zmm3, %zmm3, %k2 ; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: ktestd %k1, %k0 -; SKX-NEXT: je LBB75_1 +; SKX-NEXT: je LBB76_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB75_1: ## %bar +; SKX-NEXT: LBB76_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4972,12 +5013,12 @@ ; AVX512BW-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: kord %k2, %k1, %k1 ; AVX512BW-NEXT: ktestd %k1, %k0 -; AVX512BW-NEXT: je LBB75_1 +; AVX512BW-NEXT: je LBB76_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB75_1: ## %bar +; AVX512BW-NEXT: LBB76_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5014,12 +5055,12 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB75_1 +; AVX512DQ-NEXT: je LBB76_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB75_1: ## %bar +; AVX512DQ-NEXT: LBB76_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -5036,12 +5077,12 @@ ; X86-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; X86-NEXT: kord %k2, %k1, %k1 ; X86-NEXT: ktestd %k1, %k0 -; X86-NEXT: je LBB75_1 +; X86-NEXT: je LBB76_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB75_1: ## %bar +; X86-NEXT: LBB76_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5121,12 +5162,12 @@ ; KNL-NEXT: orl %eax, %edx ; KNL-NEXT: shlq $32, %rdx ; KNL-NEXT: orq %rcx, %rdx -; KNL-NEXT: je LBB76_1 +; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB76_1: ## %bar +; KNL-NEXT: LBB77_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -5143,12 +5184,12 @@ ; SKX-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; SKX-NEXT: korq %k2, %k1, %k1 ; SKX-NEXT: ktestq %k1, %k0 -; SKX-NEXT: je LBB76_1 +; SKX-NEXT: je LBB77_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB76_1: ## %bar +; SKX-NEXT: LBB77_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -5165,12 +5206,12 @@ ; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: korq %k2, %k1, %k1 ; AVX512BW-NEXT: ktestq %k1, %k0 -; AVX512BW-NEXT: je LBB76_1 +; AVX512BW-NEXT: je LBB77_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB76_1: ## %bar +; AVX512BW-NEXT: LBB77_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -5231,12 +5272,12 @@ ; AVX512DQ-NEXT: orl %eax, %edx ; AVX512DQ-NEXT: shlq $32, %rdx ; AVX512DQ-NEXT: orq %rcx, %rdx -; AVX512DQ-NEXT: je LBB76_1 +; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB76_1: ## %bar +; AVX512DQ-NEXT: LBB77_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -5255,12 +5296,12 @@ ; X86-NEXT: kandq %k1, %k0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB76_1 +; X86-NEXT: je LBB77_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl 
$12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB76_1: ## %bar +; X86-NEXT: LBB77_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -5360,3 +5401,20 @@ %maskv = insertelement <64 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0 ret <64 x i1> %maskv } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll @@ -130,6 +130,24 @@ ret i64 %div } +define i64 @div64_pgso(i64 %a, i64 %b) !prof !15 { +; CHECK-LABEL: div64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: cqto +; CHECK-NEXT: idivq %rsi +; CHECK-NEXT: retq +; +; HUGEWS-LABEL: div64_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movq %rdi, %rax +; HUGEWS-NEXT: cqto +; HUGEWS-NEXT: idivq %rsi +; HUGEWS-NEXT: retq + %div = sdiv i64 %a, %b + ret i64 %div +} + define i64 @div64_hugews(i64 %a, i64 %b) { ; ATOM-LABEL: div64_hugews: ; ATOM: # %bb.0: @@ -137,12 +155,12 @@ ; ATOM-NEXT: movq %rdi, %rax ; ATOM-NEXT: orq %rsi, %rcx ; ATOM-NEXT: shrq $32, %rcx -; ATOM-NEXT: je .LBB3_1 +; ATOM-NEXT: je .LBB4_1 ; ATOM-NEXT: # %bb.2: ; ATOM-NEXT: cqto ; ATOM-NEXT: idivq %rsi ; ATOM-NEXT: retq -; ATOM-NEXT: .LBB3_1: +; ATOM-NEXT: .LBB4_1: ; ATOM-NEXT: # kill: def $eax killed $eax killed $rax ; ATOM-NEXT: xorl %edx, %edx ; ATOM-NEXT: divl %esi @@ -155,12 +173,12 @@ ; SLM-NEXT: movq %rdi, %rax ; SLM-NEXT: orq %rsi, %rcx ; SLM-NEXT: shrq $32, %rcx -; SLM-NEXT: je .LBB3_1 +; SLM-NEXT: je .LBB4_1 ; SLM-NEXT: # %bb.2: ; SLM-NEXT: cqto ; SLM-NEXT: idivq %rsi ; SLM-NEXT: retq -; SLM-NEXT: .LBB3_1: +; SLM-NEXT: .LBB4_1: ; SLM-NEXT: xorl %edx, %edx ; SLM-NEXT: # kill: def $eax killed $eax killed $rax ; SLM-NEXT: divl %esi @@ -173,12 +191,12 @@ ; SKL-NEXT: movq %rdi, %rcx ; SKL-NEXT: orq %rsi, %rcx ; SKL-NEXT: shrq $32, %rcx -; SKL-NEXT: je .LBB3_1 +; SKL-NEXT: je .LBB4_1 ; SKL-NEXT: # %bb.2: ; SKL-NEXT: cqto ; SKL-NEXT: idivq %rsi ; SKL-NEXT: retq -; SKL-NEXT: .LBB3_1: +; SKL-NEXT: .LBB4_1: ; SKL-NEXT: # kill: def $eax killed $eax killed $rax ; SKL-NEXT: xorl %edx, %edx ; SKL-NEXT: divl %esi @@ -213,6 +231,24 @@ ret i32 %div } +define i32 @div32_pgso(i32 %a, i32 %b) !prof !15 { +; CHECK-LABEL: div32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cltd +; CHECK-NEXT: idivl %esi +; CHECK-NEXT: retq +; +; 
HUGEWS-LABEL: div32_pgso: +; HUGEWS: # %bb.0: +; HUGEWS-NEXT: movl %edi, %eax +; HUGEWS-NEXT: cltd +; HUGEWS-NEXT: idivl %esi +; HUGEWS-NEXT: retq + %div = sdiv i32 %a, %b + ret i32 %div +} + define i32 @div32_minsize(i32 %a, i32 %b) minsize { ; CHECK-LABEL: div32_minsize: ; CHECK: # %bb.0: @@ -246,3 +282,4 @@ !12 = !{i32 10000, i64 1000, i32 1} !13 = !{i32 999000, i64 1000, i32 3} !14 = !{i32 999999, i64 5, i32 3} +!15 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll --- a/llvm/test/CodeGen/X86/cmov-into-branch.ll +++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll @@ -88,7 +88,7 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15 ret i32 %sel } @@ -104,7 +104,7 @@ ; CHECK-NEXT: .LBB6_2: # %select.end ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 ret i32 %sel } @@ -124,7 +124,7 @@ ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17 ret i32 %sel } @@ -137,12 +137,51 @@ ; CHECK-NEXT: cmovnel %edi, %eax ; CHECK-NEXT: retq %cmp = icmp ne i32 %a, 0 - %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !18 ret i32 %sel } -!0 = !{!"branch_weights", i32 1, i32 99} -!1 = !{!"branch_weights", i32 1, i32 100} -!2 = !{!"branch_weights", i32 100, i32 1} -!3 = !{!"branch_weights", i32 0, i32 0} +define i32 @weighted_select_optsize(i32 %a, i32 %b) optsize { +; CHECK-LABEL: weighted_select_optsize: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} + +define i32 @weighted_select_pgso(i32 %a, i32 %b) !prof !14 { +; CHECK-LABEL: weighted_select_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16 + ret i32 %sel +} +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} +!15 = !{!"branch_weights", i32 1, i32 99} +!16 = !{!"branch_weights", i32 1, i32 100} +!17 = !{!"branch_weights", i32 100, i32 1} +!18 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32 +; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | 
FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64 +; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64 + +declare void @foo() +declare void @bar() + +define void @f(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x08] +; CHECK32-NEXT: jne bar # TAILCALL +; CHECK32-NEXT: # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne bar # TAILCALL +; CHECK64-NEXT: # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne bar # TAILCALL +; WIN64-NEXT: # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +entry: + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +; Check that the asm doesn't just look good, but uses the correct encoding. 
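+; In particular, the conditional tail call must be emitted as a single short
+; jcc straight to the callee (jne bar, encoding [0x75,A]), not as a jcc over
+; a separate unconditional jmp.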
+} + +define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { +; CHECK32-LABEL: f_non_leaf: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: pushl %ebx # encoding: [0x53] +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: .cfi_offset %ebx, -8 +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] +; CHECK32-NEXT: #APP +; CHECK32-NEXT: #NO_APP +; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c] +; CHECK32-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB1_2: # %bb2 +; CHECK32-NEXT: .cfi_def_cfa_offset 8 +; CHECK32-NEXT: popl %ebx # encoding: [0x5b] +; CHECK32-NEXT: .cfi_def_cfa_offset 4 +; CHECK32-NEXT: jmp bar # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: f_non_leaf: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: pushq %rbx # encoding: [0x53] +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: .cfi_offset %rbx, -16 +; CHECK64-NEXT: #APP +; CHECK64-NEXT: #NO_APP +; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; CHECK64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB1_2: # %bb2 +; CHECK64-NEXT: .cfi_def_cfa_offset 16 +; CHECK64-NEXT: popq %rbx # encoding: [0x5b] +; CHECK64-NEXT: .cfi_def_cfa_offset 8 +; CHECK64-NEXT: jmp bar # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: f_non_leaf: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: pushq %rbx # encoding: [0x53] +; WIN64-NEXT: .seh_pushreg %rbx +; WIN64-NEXT: .seh_endprologue +; WIN64-NEXT: #APP +; WIN64-NEXT: #NO_APP +; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] +; WIN64-NEXT: jne .LBB1_2 # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB1_2: # %bb2 +; WIN64-NEXT: nop # encoding: [0x90] +; WIN64-NEXT: popq %rbx # encoding: [0x5b] +; WIN64-NEXT: jmp bar # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: .seh_handlerdata +; WIN64-NEXT: .text +; WIN64-NEXT: .seh_endproc +entry: + ; Force %ebx to be spilled on the stack, turning this into + ; not a "leaf" function for Win64. 
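+  ; Because %ebx is then saved/restored, each conditional tail call needs its
+  ; own epilogue first (the popl/popq of %ebx before the jmp in the checks
+  ; above).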
+ tail call void asm sideeffect "", "~{ebx}"() + + %p = icmp eq i32 %x, %y + br i1 %p, label %bb1, label %bb2 +bb1: + tail call void @foo() + ret void +bb2: + tail call void @bar() + ret void + +} + +declare x86_thiscallcc zeroext i1 @baz(i8*, i32) +define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) !prof !14 { +; CHECK32-LABEL: BlockPlacementTest: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] +; CHECK32-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; CHECK32-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: # %land.rhs +; CHECK32-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK32-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; CHECK32-NEXT: je baz # TAILCALL +; CHECK32-NEXT: # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB2_2: # %land.end +; CHECK32-NEXT: # kill: def $al killed $al killed $eax +; CHECK32-NEXT: retl $4 # encoding: [0xc2,0x04,0x00] +; CHECK32-NEXT: .LBB2_3: +; CHECK32-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK32-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; CHECK64-LABEL: BlockPlacementTest: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: testb $42, %sil # encoding: [0x40,0xf6,0xc6,0x2a] +; CHECK64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: # %land.rhs +; CHECK64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; CHECK64-NEXT: testb $44, %sil # encoding: [0x40,0xf6,0xc6,0x2c] +; CHECK64-NEXT: je baz # TAILCALL +; CHECK64-NEXT: # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB2_2: # %land.end +; CHECK64-NEXT: # kill: def $al killed $al killed $eax +; CHECK64-NEXT: retq # encoding: [0xc3] +; CHECK64-NEXT: .LBB2_3: +; CHECK64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; CHECK64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +; +; WIN64-LABEL: BlockPlacementTest: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: testb $42, %dl # encoding: [0xf6,0xc2,0x2a] +; WIN64-NEXT: je .LBB2_3 # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: # %land.rhs +; WIN64-NEXT: movb $1, %al # encoding: [0xb0,0x01] +; WIN64-NEXT: testb $44, %dl # encoding: [0xf6,0xc2,0x2c] +; WIN64-NEXT: je baz # TAILCALL +; WIN64-NEXT: # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB2_2: # %land.end +; WIN64-NEXT: # kill: def $al killed $al killed $eax +; WIN64-NEXT: retq # encoding: [0xc3] +; WIN64-NEXT: .LBB2_3: +; WIN64-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; WIN64-NEXT: jmp .LBB2_2 # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1 +entry: + %and = and i32 %x, 42 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %land.end, label %land.rhs + +land.rhs: + %and6 = and i32 %x, 44 + %tobool7 = icmp eq i32 %and6, 0 + br i1 %tobool7, label %lor.rhs, label %land.end + +lor.rhs: + %call = tail call x86_thiscallcc zeroext i1 @baz(i8* %this, i32 %x) #2 + br label %land.end + +land.end: + %0 = phi i1 [ false, %entry ], [ true, %land.rhs ], [ %call, %lor.rhs ] + ret i1 %0 + +; Make sure machine 
block placement isn't confused by the conditional tail call, +; but sees that it can fall through to the next block. +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll --- a/llvm/test/CodeGen/X86/fixup-lea.ll +++ b/llvm/test/CodeGen/X86/fixup-lea.ll @@ -108,17 +108,96 @@ ret void } +define void @foo_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: foo_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: decl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: foo_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: decl %ecx +; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $-1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %dec, %for.body ], [ 0, %entry ] + %dec = add i16 %i.05, -1 + %conv = zext i16 %dec to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} + +define void @bar_pgso(i32 inreg %dns) !prof !14 { +; SLOW-LABEL: bar_pgso: +; SLOW: # %bb.0: # %entry +; SLOW-NEXT: xorl %ecx, %ecx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 +; SLOW-NEXT: movzwl %cx, %edx +; SLOW-NEXT: incl %ecx +; SLOW-NEXT: cmpl %eax, %edx +; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: # %bb.2: # %for.end +; SLOW-NEXT: retl +; +; FAST-LABEL: bar_pgso: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xorl %ecx, %ecx +; FAST-NEXT: incl %ecx +; FAST-NEXT: .LBB5_1: # %for.body +; FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-NEXT: movzwl %cx, %edx +; FAST-NEXT: addl $1, %ecx +; FAST-NEXT: cmpl %eax, %edx +; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: # %bb.2: # %for.end +; FAST-NEXT: retl +entry: + br label %for.body + +for.body: + %i.05 = phi i16 [ %inc, %for.body ], [ 0, %entry ] + %inc = add i16 %i.05, 1 + %conv = zext i16 %inc to i32 + %cmp = icmp slt i32 %conv, %dns + br i1 %cmp, label %for.body, label %for.end +for.end: + ret void +} + define void @foo_nosize(i32 inreg %dns) { ; SLOW-LABEL: foo_nosize: ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $-1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB4_1: # %for.body +; SLOW-NEXT: .LBB6_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: decl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB4_1 +; SLOW-NEXT: jl .LBB6_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -126,12 +205,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $-1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB4_1: # %for.body +; FAST-NEXT: .LBB6_1: # %for.body 
; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $-1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB4_1 +; FAST-NEXT: jl .LBB6_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -153,12 +232,12 @@ ; SLOW: # %bb.0: # %entry ; SLOW-NEXT: movw $1, %cx ; SLOW-NEXT: .p2align 4, 0x90 -; SLOW-NEXT: .LBB5_1: # %for.body +; SLOW-NEXT: .LBB7_1: # %for.body ; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 ; SLOW-NEXT: movzwl %cx, %edx ; SLOW-NEXT: incl %ecx ; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB5_1 +; SLOW-NEXT: jl .LBB7_1 ; SLOW-NEXT: # %bb.2: # %for.end ; SLOW-NEXT: retl ; @@ -166,12 +245,12 @@ ; FAST: # %bb.0: # %entry ; FAST-NEXT: movw $1, %cx ; FAST-NEXT: .p2align 4, 0x90 -; FAST-NEXT: .LBB5_1: # %for.body +; FAST-NEXT: .LBB7_1: # %for.body ; FAST-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-NEXT: movzwl %cx, %edx ; FAST-NEXT: addl $1, %ecx ; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB5_1 +; FAST-NEXT: jl .LBB7_1 ; FAST-NEXT: # %bb.2: # %for.end ; FAST-NEXT: retl entry: @@ -186,3 +265,20 @@ for.end: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -113,6 +113,38 @@ ret <4 x float> %res } +define float @rcpss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rcpss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rcpss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: rcpss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rcpss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rcpss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @rsqrtss_size(float* %a) optsize { ; SSE-LABEL: rsqrtss_size: ; SSE: # %bb.0: @@ -145,6 +177,38 @@ ret <4 x float> %res } +define float @rsqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: rsqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: rsqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss 
(%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: rsqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define float @sqrtss_size(float* %a) optsize{ ; SSE-LABEL: sqrtss_size: ; SSE: # %bb.0: @@ -196,6 +260,57 @@ ret <4 x float> %res } +define float @sqrtss_pgso(float* %a) !prof !14 { +; SSE-LABEL: sqrtss_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtss (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load float, float* %a + %ins = insertelement <4 x float> undef, float %ld, i32 0 + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins) + %ext = extractelement <4 x float> %res, i32 0 + ret float %ext +} + +define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + +define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 { +; SSE-LABEL: sqrtss_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtss_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <4 x float>, <4 x float>* %a + %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) + ret <4 x float> %res +} + define double @sqrtsd_size(double* %a) optsize { ; SSE-LABEL: sqrtsd_size: ; SSE: # %bb.0: @@ -247,7 +362,75 @@ ret <2 x double> %res } +define double @sqrtsd_pgso(double* %a) !prof !14 { +; SSE-LABEL: sqrtsd_pgso: +; SSE: # %bb.0: +; SSE-NEXT: sqrtsd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load double, double* %a + %ins = insertelement <2 x double> undef, double %ld, i32 0 + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins) + %ext = extractelement <2 x double> %res, i32 0 + ret double %ext +} + +define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x double> %res +} + +define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 { +; SSE-LABEL: sqrtsd_full_pgso_volatile: +; SSE: # %bb.0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: sqrtsd_full_pgso_volatile: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %ld = load volatile <2 x double>, <2 x double>* %a + %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) + ret <2 x 
double> %res +} + declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -196,6 +196,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shldl %cl, %esi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,36 +236,36 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %ebp ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movb $64, %cl ; X86-FAST-NEXT: subb %bl, %cl ; X86-FAST-NEXT: movl %edx, %esi ; X86-FAST-NEXT: shrl %cl, %esi ; X86-FAST-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-FAST-NEXT: testb $32, %cl -; X86-FAST-NEXT: jne .LBB4_3 +; X86-FAST-NEXT: jne .LBB5_3 ; X86-FAST-NEXT: # %bb.4: ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: jne .LBB4_6 -; X86-FAST-NEXT: jmp .LBB4_7 -; X86-FAST-NEXT: .LBB4_3: +; X86-FAST-NEXT: jne .LBB5_6 +; X86-FAST-NEXT: jmp .LBB5_7 +; X86-FAST-NEXT: .LBB5_3: ; X86-FAST-NEXT: movl %esi, %ecx ; X86-FAST-NEXT: xorl %esi, %esi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_7 -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: je .LBB5_7 +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: orl %esi, %ebp ; X86-FAST-NEXT: orl %ecx, %edi ; X86-FAST-NEXT: movl %edi, %eax ; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: .LBB4_7: +; X86-FAST-NEXT: .LBB5_7: ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi @@ -279,11 +299,11 @@ ; X86-SLOW-NEXT: testb %dl, %dl ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %eax, %ebp ; X86-SLOW-NEXT: movl %ebp, (%esp) # 
4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SLOW-NEXT: movl %ebp, %eax ; X86-SLOW-NEXT: movl %ebx, %ecx @@ -294,41 +314,41 @@ ; X86-SLOW-NEXT: negb %cl ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %eax, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SLOW-NEXT: movl %eax, %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movb %dh, %cl ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: testb $32, %dh -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %ecx ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %esi, %ebp ; X86-SLOW-NEXT: orl %ecx, %edi ; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -503,3 +523,20 @@ %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -195,6 +195,26 @@ ret i32 %tmp } +define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 { +; X86-LABEL: var_shift_i32_pgso: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: var_shift_i32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shrdl %cl, %edi, %eax +; X64-NEXT: retq + %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) + ret i32 %tmp +} + define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ; X86-FAST-LABEL: var_shift_i64: ; X86-FAST: # %bb.0: @@ -216,30 +236,30 @@ ; X86-FAST-NEXT: shll %cl, %edi ; X86-FAST-NEXT: shldl %cl, %eax, %esi ; X86-FAST-NEXT: testb $32, %cl 
-; X86-FAST-NEXT: je .LBB4_2 +; X86-FAST-NEXT: je .LBB5_2 ; X86-FAST-NEXT: # %bb.1: ; X86-FAST-NEXT: movl %edi, %esi ; X86-FAST-NEXT: xorl %edi, %edi -; X86-FAST-NEXT: .LBB4_2: +; X86-FAST-NEXT: .LBB5_2: ; X86-FAST-NEXT: movl %edx, %ebp ; X86-FAST-NEXT: movl %ebx, %ecx ; X86-FAST-NEXT: shrl %cl, %ebp ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FAST-NEXT: shrdl %cl, %edx, %eax ; X86-FAST-NEXT: testb $32, %bl -; X86-FAST-NEXT: je .LBB4_4 +; X86-FAST-NEXT: je .LBB5_4 ; X86-FAST-NEXT: # %bb.3: ; X86-FAST-NEXT: movl %ebp, %eax ; X86-FAST-NEXT: xorl %ebp, %ebp -; X86-FAST-NEXT: .LBB4_4: +; X86-FAST-NEXT: .LBB5_4: ; X86-FAST-NEXT: testl %ebx, %ebx -; X86-FAST-NEXT: je .LBB4_6 +; X86-FAST-NEXT: je .LBB5_6 ; X86-FAST-NEXT: # %bb.5: ; X86-FAST-NEXT: orl %ebp, %esi ; X86-FAST-NEXT: orl %eax, %edi ; X86-FAST-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: .LBB4_6: +; X86-FAST-NEXT: .LBB5_6: ; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-FAST-NEXT: addl $4, %esp ; X86-FAST-NEXT: popl %esi @@ -274,11 +294,11 @@ ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb %ch, %ch ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_2 +; X86-SLOW-NEXT: je .LBB5_2 ; X86-SLOW-NEXT: # %bb.1: ; X86-SLOW-NEXT: orl %edi, %edx ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: .LBB4_2: +; X86-SLOW-NEXT: .LBB5_2: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLOW-NEXT: movl %ecx, %edx ; X86-SLOW-NEXT: movl %ebx, %ecx @@ -291,41 +311,41 @@ ; X86-SLOW-NEXT: shll %cl, %edi ; X86-SLOW-NEXT: testb %ah, %ah ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: je .LBB4_4 +; X86-SLOW-NEXT: je .LBB5_4 ; X86-SLOW-NEXT: # %bb.3: ; X86-SLOW-NEXT: orl %edx, %edi ; X86-SLOW-NEXT: movl %edi, %ebp -; X86-SLOW-NEXT: .LBB4_4: +; X86-SLOW-NEXT: .LBB5_4: ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SLOW-NEXT: movl %ebx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: testb $32, %bl -; X86-SLOW-NEXT: je .LBB4_6 +; X86-SLOW-NEXT: je .LBB5_6 ; X86-SLOW-NEXT: # %bb.5: ; X86-SLOW-NEXT: movl %edi, %ebp ; X86-SLOW-NEXT: xorl %edi, %edi -; X86-SLOW-NEXT: .LBB4_6: +; X86-SLOW-NEXT: .LBB5_6: ; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: testb $32, %al ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: jne .LBB4_7 +; X86-SLOW-NEXT: jne .LBB5_7 ; X86-SLOW-NEXT: # %bb.8: ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: jne .LBB4_10 -; X86-SLOW-NEXT: jmp .LBB4_11 -; X86-SLOW-NEXT: .LBB4_7: +; X86-SLOW-NEXT: jne .LBB5_10 +; X86-SLOW-NEXT: jmp .LBB5_11 +; X86-SLOW-NEXT: .LBB5_7: ; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: xorl %esi, %esi ; X86-SLOW-NEXT: testl %ebx, %ebx -; X86-SLOW-NEXT: je .LBB4_11 -; X86-SLOW-NEXT: .LBB4_10: +; X86-SLOW-NEXT: je .LBB5_11 +; X86-SLOW-NEXT: .LBB5_10: ; X86-SLOW-NEXT: orl %ebp, %esi ; X86-SLOW-NEXT: orl %edi, %eax ; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: .LBB4_11: +; X86-SLOW-NEXT: .LBB5_11: ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: addl $8, %esp ; X86-SLOW-NEXT: popl %esi @@ -499,3 +519,20 @@ %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7) ret i64 %tmp } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} 
+!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1983,6 +1983,80 @@ ret float %x230 } +define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_4_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_4_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq + %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %x227 = fadd <4 x float> %x225, %x226 + %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %x229 = fadd <4 x float> %x227, %x228 + %x230 = extractelement <4 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_8_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %x227 = fadd <8 x float> %x225, %x226 + %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %x229 = fadd <8 x float> %x227, %x228 + %x230 = extractelement <8 x float> %x229, i32 0 + ret float %x230 +} + +define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { +; SSE3-LABEL: hadd32_16_pgso: +; SSE3: # %bb.0: +; SSE3-NEXT: movaps %xmm0, %xmm1 +; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: addps %xmm0, %xmm1 +; SSE3-NEXT: haddps %xmm1, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; AVX-LABEL: hadd32_16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq + %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %x227 = fadd <16 x float> %x225, %x226 + %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %x229 = fadd <16 x float> %x227, %x228 + 
%x230 = extractelement <16 x float> %x229, i32 0 + ret float %x230 +} + define float @partial_reduction_fadd_v8f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32: ; SSE3-SLOW: # %bb.0: @@ -2115,3 +2189,20 @@ %r = extractelement <16 x float> %x0123, i32 0 ret float %r } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll --- a/llvm/test/CodeGen/X86/immediate_merging.ll +++ b/llvm/test/CodeGen/X86/immediate_merging.ll @@ -73,6 +73,68 @@ ret i32 0 } + +; Test PGSO to make sure immediates with multiple users don't get pulled into +; instructions. +define i32 @foo_pgso() !prof !14 { +; X86-LABEL: foo_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $1234, %eax # imm = 0x4D2 +; X86-NEXT: movl %eax, a +; X86-NEXT: movl %eax, b +; X86-NEXT: movl $12, %eax +; X86-NEXT: movl %eax, c +; X86-NEXT: cmpl %eax, e +; X86-NEXT: jne .LBB1_2 +; X86-NEXT: # %bb.1: # %if.then +; X86-NEXT: movl $1, x +; X86-NEXT: .LBB1_2: # %if.end +; X86-NEXT: movl $1234, f # imm = 0x4D2 +; X86-NEXT: movl $555, %eax # imm = 0x22B +; X86-NEXT: movl %eax, h +; X86-NEXT: addl %eax, i +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl +; +; X64-LABEL: foo_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movl $1234, %eax # imm = 0x4D2 +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: movl $12, %eax +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: cmpl %eax, {{.*}}(%rip) +; X64-NEXT: jne .LBB1_2 +; X64-NEXT: # %bb.1: # %if.then +; X64-NEXT: movl $1, {{.*}}(%rip) +; X64-NEXT: .LBB1_2: # %if.end +; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2 +; X64-NEXT: movl $555, %eax # imm = 0x22B +; X64-NEXT: movl %eax, {{.*}}(%rip) +; X64-NEXT: addl %eax, {{.*}}(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +entry: + store i32 1234, i32* @a + store i32 1234, i32* @b + store i32 12, i32* @c + %0 = load i32, i32* @e + %cmp = icmp eq i32 %0, 12 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 1, i32* @x + br label %if.end + +; New block. Make sure 1234 isn't live across basic blocks from before. +if.end: ; preds = %if.then, %entry + store i32 1234, i32* @f + store i32 555, i32* @h + %1 = load i32, i32* @i + %add1 = add nsw i32 %1, 555 + store i32 %add1, i32* @i + ret i32 0 +} + ; Test -O2 to make sure that all immediates get pulled in to their users. define i32 @foo2() { ; X86-LABEL: foo2: @@ -124,3 +186,47 @@ call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) ret void } + +; memset gets lowered in DAG. Constant merging should hoist all the +; immediates used to store to the individual memory locations. Make +; sure we don't directly store the immediates. 
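+; Note on how the PGSO test functions in this file are driven: the +; ProfileSummary module flags and the !prof !14 function entry count of 0 +; appended at the end of the file mark the annotated functions as cold +; relative to the profile, so llvm::shouldOptimizeForSize() should return +; true for them and they should be lowered as if they were marked optsize.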
+define void @foomemset_pgso() !prof !14 { +; X86-LABEL: foomemset_pgso: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $555819297, %eax # imm = 0x21212121 +; X86-NEXT: movl %eax, AA+20 +; X86-NEXT: movl %eax, AA+16 +; X86-NEXT: movl %eax, AA+12 +; X86-NEXT: movl %eax, AA+8 +; X86-NEXT: movl %eax, AA+4 +; X86-NEXT: movl %eax, AA +; X86-NEXT: retl +; +; X64-LABEL: foomemset_pgso: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, AA+{{.*}}(%rip) +; X64-NEXT: movq %rax, {{.*}}(%rip) +; X64-NEXT: retq +entry: + call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/immediate_merging64.ll b/llvm/test/CodeGen/X86/immediate_merging64.ll --- a/llvm/test/CodeGen/X86/immediate_merging64.ll +++ b/llvm/test/CodeGen/X86/immediate_merging64.ll @@ -19,6 +19,19 @@ ret i1 %cmp } +define i1 @imm_multiple_users_pgso(i64 %a, i64* %b) !prof !14 { +; CHECK-LABEL: imm_multiple_users_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq $-1, %rax +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: cmpq %rax, %rdi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + store i64 -1, i64* %b, align 8 + %cmp = icmp eq i64 %a, -1 + ret i1 %cmp +} + declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) ; Inlined memsets requiring multiple same-sized stores should be lowered using @@ -34,3 +47,31 @@ tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) ret void } + +define void @memset_zero_pgso(i8* noalias nocapture %D) !prof !14 { +; CHECK-LABEL: memset_zero_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movq %rax, 7(%rdi) +; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: retq + tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false) + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/loop-blocks.ll b/llvm/test/CodeGen/X86/loop-blocks.ll --- a/llvm/test/CodeGen/X86/loop-blocks.ll +++ b/llvm/test/CodeGen/X86/loop-blocks.ll @@ -269,6 +269,35 @@ attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } +; CHECK-LABEL: slightly_more_involved_2_pgso: +; CHECK-NOT: jmp .LBB6_1 +; CHECK: .LBB6_1: +; CHECK-NEXT: callq body + +define void @slightly_more_involved_2_pgso() norecurse nounwind readnone uwtable !prof !14 { +entry: + br label %loop + +loop: + call void 
@body() + %t0 = call i32 @get() + %t1 = icmp slt i32 %t0, 2 + br i1 %t1, label %block_a, label %bb + +bb: + %t2 = call i32 @get() + %t3 = icmp slt i32 %t2, 99 + br i1 %t3, label %exit, label %loop + +block_a: + call void @bar99() + br label %loop + +exit: + call void @exit() + ret void +} + declare void @bar99() nounwind declare void @bar100() nounwind declare void @bar101() nounwind @@ -281,3 +310,20 @@ declare void @block_a_true_func() nounwind declare void @block_a_false_func() nounwind declare void @block_a_merge_func() nounwind + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll --- a/llvm/test/CodeGen/X86/materialize.ll +++ b/llvm/test/CodeGen/X86/materialize.ll @@ -30,6 +30,21 @@ ; CHECK64-NEXT: retq } +define i32 @one32_pgso() !prof !14 { +entry: + ret i32 1 + +; CHECK32-LABEL: one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl + +; FIXME: Figure out the best approach in 64-bit mode. +; CHECK64-LABEL: one32_pgso: +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + define i32 @one32_minsize() minsize { entry: ret i32 1 @@ -107,6 +122,16 @@ ; CHECK32-NEXT: retl } +define i32 @minus_one32_pgso() !prof !14 { +entry: + ret i32 -1 + +; CHECK32-LABEL: minus_one32_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + define i32 @minus_one32_minsize() minsize { entry: ret i32 -1 @@ -140,6 +165,28 @@ ; CHECK32-NEXT: retl } +define i16 @one16_pgso() !prof !14 { +entry: + ret i16 1 + +; CHECK32-LABEL: one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + +define i16 @minus_one16_pgso() !prof !14 { +entry: + ret i16 -1 + +; CHECK32-LABEL: minus_one16_pgso: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: # kill +; CHECK32-NEXT: retl +} + define i32 @minus_five32() minsize { entry: ret i32 -5 @@ -213,4 +260,72 @@ ; CHECK32: retl } +define i32 @rematerialize_minus_one_pgso() !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; -1 should be re-materialized here instead of getting spilled above. + ret i32 -1 + +; CHECK32-LABEL: rematerialize_minus_one_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NOT: %eax +; CHECK32: retl +} + +define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. 
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; Define eflags. + %a = icmp ne i32 %x, 123 + %b = zext i1 %a to i32 + ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. + ; It must therefore not use the xor-dec lowering. + %c = select i1 %a, i32 %b, i32 -1 + ret i32 %c + +; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: cmpl +; CHECK32: setne +; CHECK32-NOT: xorl +; CHECK32: movl $-1 +; CHECK32: cmov +; CHECK32: retl +} + declare x86_thiscallcc void @f(i32) + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -0,0 +1,1064 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 + +; This tests codegen-time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: 
retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # %bb.0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: xorw (%rsi), %ax +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorb 2(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orw %ax, %cx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define 
i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # %bb.0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5: +; X86: # %bb.0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # %bb.2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # %bb.0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # %bb.2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, 
%eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # %bb.0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: xorl (%rsi), %eax +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorb 4(%rsi), %cl +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: seta %al +; X64-NEXT: sbbl $0, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # %bb.0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl 8(%rsi), %ecx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: 
setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # %bb.0: +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # %bb.1: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: setae %al +; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: 
setne %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length16_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length24: +; X64: # %bb.0: +; X64-NEXT: movl $24, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pmovmskb %xmm2, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al 
+; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X64-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: sete %al +; X64-AVX-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length24_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX-LABEL: length24_eq_const: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vptest %xmm0, %xmm0 +; X64-AVX-NEXT: setne %al +; X64-AVX-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # %bb.0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), 
%xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movdqu (%rsi), %xmm2 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm1 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length32_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl 
$0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # %bb.0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: setne %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length64_eq_const: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $64, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX1-LABEL: length64_eq_const: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vptest %ymm0, %ymm0 +; X64-AVX1-NEXT: sete %al +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: length64_eq_const: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vptest %ymm0, %ymm0 +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind 
!prof !14 { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: bcmp_length2: +; X64: # %bb.0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -139,6 +139,36 @@ ret void } +define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 { +; LINUX-LABEL: test3_pgso: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy # TAILCALL +; +; DARWIN-LABEL: test3_pgso: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movq 56(%rsi), %rax +; DARWIN-NEXT: movq %rax, 56(%rdi) +; DARWIN-NEXT: movq 48(%rsi), %rax +; DARWIN-NEXT: movq %rax, 48(%rdi) +; DARWIN-NEXT: movq 40(%rsi), %rax +; DARWIN-NEXT: movq %rax, 40(%rdi) +; DARWIN-NEXT: movq 32(%rsi), %rax +; DARWIN-NEXT: movq %rax, 32(%rdi) +; DARWIN-NEXT: movq 24(%rsi), %rax +; DARWIN-NEXT: movq %rax, 24(%rdi) +; DARWIN-NEXT: movq 16(%rsi), %rax +; DARWIN-NEXT: movq %rax, 16(%rdi) +; DARWIN-NEXT: movq (%rsi), %rax +; DARWIN-NEXT: movq 8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, 8(%rdi) +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) + ret void +} + define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone { ; DARWIN-LABEL: test3_minsize: ; DARWIN: ## %bb.0: @@ -506,3 +536,20 @@ tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false) ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll --- a/llvm/test/CodeGen/X86/powi.ll +++ b/llvm/test/CodeGen/X86/powi.ll @@ -86,6 +86,39 @@ ret double %ret } +define double @pow_wrapper_pgso(double %a) !prof !14 { +; X86-X87-LABEL: pow_wrapper_pgso: 
+; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 16 +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; X86-X87-NEXT: .cfi_def_cfa_offset 4 +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_pgso: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_pgso: +; X64: # %bb.0: +; X64-NEXT: movl $15, %edi +; X64-NEXT: jmp __powidf2 # TAILCALL + %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1] + ret double %ret +} + define double @pow_wrapper_minsize(double %a) minsize { ; X86-X87-LABEL: pow_wrapper_minsize: ; X86-X87: # %bb.0: @@ -124,3 +157,19 @@ declare double @llvm.powi.f64(double, i32) nounwind readonly +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll --- a/llvm/test/CodeGen/X86/rounding-ops.ll +++ b/llvm/test/CodeGen/X86/rounding-ops.ll @@ -252,3 +252,60 @@ %call = tail call double @trunc(double %x) nounwind readnone ret double %call } + +define float @test11_pgso(float* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test11_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundss $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test11_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test11_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load float, float* %xptr + %call = tail call float @truncf(float %x) nounwind readnone + ret float %call +} + +define double @test12_pgso(double* %xptr) nounwind !prof !14 { +; CHECK-SSE-LABEL: test12_pgso: +; CHECK-SSE: ## %bb.0: +; CHECK-SSE-NEXT: roundsd $11, (%rdi), %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: test12_pgso: +; CHECK-AVX: ## %bb.0: +; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: retq +; +; CHECK-AVX512-LABEL: test12_pgso: +; CHECK-AVX512: ## %bb.0: +; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: retq + %x = load double, double* %xptr + %call = tail call double @trunc(double %x) nounwind readnone + ret double %call +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} 
+!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s + +declare void @bar() + +define void @test1(i32* nocapture %X) nounwind !prof !14 { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, (%rdi) +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %tmp1 = load i32, i32* %X, align 4 + %and = and i32 %tmp1, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test3(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; PR16083 +define i1 @test4(i64 %a, i32 %b) { +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je .LBB3_1 +; CHECK-NEXT: # %bb.2: # %lor.end +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB3_1: # %lor.rhs +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %tobool = icmp ne i32 %b, 0 + br i1 %tobool, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %entry + %and = and i64 0, %a + %tobool1 = icmp ne i64 %and, 0 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %entry + %p = phi i1 [ true, %entry ], [ %tobool1, %lor.rhs ] + ret i1 %p +} + +@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4 + +; PR16551 +define void @test5(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl x+{{.*}}(%rip), %eax +; CHECK-NEXT: shll $16, %eax +; CHECK-NEXT: movzwl x+{{.*}}(%rip), %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: cmpl $1, %ecx +; CHECK-NEXT: jne bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4 + %bf.lshr = lshr i56 %bf.load, 32 + %bf.cast = trunc i56 %bf.lshr to i32 + %cmp = icmp ne i32 %bf.cast, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2_1(i32 %X) nounwind !prof !14 { +; CHECK-LABEL: test2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: 
cmpl $256, %eax # imm = 0x100 +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %and = and i32 %X, 255 + %cmp = icmp eq i32 %and, 256 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_47(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_47: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $47, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 47 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg1(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-1, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg2(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-2, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -2 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg127(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-127, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof !14 { +; CHECK-LABEL: test_sext_i8_icmp_neg128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpb $-128, %dil +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -128 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test_sext_i8_icmp_255(i8 %x) nounwind !prof !14 { +; 
CHECK-LABEL: test_sext_i8_icmp_255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je bar # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: retq +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll --- a/llvm/test/CodeGen/X86/slow-incdec.ll +++ b/llvm/test/CodeGen/X86/slow-incdec.ll @@ -54,6 +54,26 @@ ret i32 %r } +define i32 @inc_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: inc_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: incl %eax +; CHECK-NEXT: retl + %r = add i32 %x, 1 + ret i32 %r +} + +define i32 @dec_pgso(i32 %x) !prof !14 { +; CHECK-LABEL: dec_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: decl %eax +; CHECK-NEXT: retl + %r = add i32 %x, -1 + ret i32 %r +} + declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) declare void @other(i32* ) nounwind; @@ -62,20 +82,20 @@ ; INCDEC: # %bb.0: # %entry ; INCDEC-NEXT: movl {{[0-9]+}}(%esp), %eax ; INCDEC-NEXT: incl (%eax) -; INCDEC-NEXT: jne .LBB4_1 +; INCDEC-NEXT: jne .LBB6_1 ; INCDEC-NEXT: # %bb.2: # %if.end4 ; INCDEC-NEXT: jmp other # TAILCALL -; INCDEC-NEXT: .LBB4_1: # %return +; INCDEC-NEXT: .LBB6_1: # %return ; INCDEC-NEXT: retl ; ; ADD-LABEL: cond_ae_to_cond_ne: ; ADD: # %bb.0: # %entry ; ADD-NEXT: movl {{[0-9]+}}(%esp), %eax ; ADD-NEXT: addl $1, (%eax) -; ADD-NEXT: jne .LBB4_1 +; ADD-NEXT: jne .LBB6_1 ; ADD-NEXT: # %bb.2: # %if.end4 ; ADD-NEXT: jmp other # TAILCALL -; ADD-NEXT: .LBB4_1: # %return +; ADD-NEXT: .LBB6_1: # %return ; ADD-NEXT: retl entry: %t0 = load i32, i32* %p, align 8 @@ -109,10 +129,10 @@ ; INCDEC-NEXT: incb a ; INCDEC-NEXT: sete d ; INCDEC-NEXT: testb %al, %al -; INCDEC-NEXT: jne .LBB5_2 +; INCDEC-NEXT: jne .LBB7_2 ; INCDEC-NEXT: # %bb.1: # %then ; INCDEC-NEXT: jmp external_a # TAILCALL -; INCDEC-NEXT: .LBB5_2: # %else +; INCDEC-NEXT: .LBB7_2: # %else ; INCDEC-NEXT: jmp external_b # TAILCALL ; ; ADD-LABEL: test_tail_call: @@ -123,10 +143,10 @@ ; ADD-NEXT: addb $1, a ; ADD-NEXT: sete d ; ADD-NEXT: testb %al, %al -; ADD-NEXT: jne .LBB5_2 +; ADD-NEXT: jne .LBB7_2 ; ADD-NEXT: # %bb.1: # %then ; ADD-NEXT: jmp external_a # TAILCALL -; ADD-NEXT: .LBB5_2: # %else +; ADD-NEXT: .LBB7_2: # %else ; ADD-NEXT: jmp external_b # TAILCALL entry: %val = load i32, i32* %ptr @@ -152,3 +172,19 @@ ret void } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 
100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -17,6 +17,17 @@ ret <2 x double> %add } +define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v2f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; CHECK-NEXT: # xmm1 = mem[0,0] +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <2 x double> %x, <double 1.0, double 1.0> + ret <2 x double> %add +} + define <4 x double> @splat_v4f64(<4 x double> %x) #1 { ; CHECK-LABEL: splat_v4f64: ; CHECK: # %bb.0: @@ -27,6 +38,16 @@ ret <4 x double> %add } +define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 { +; CHECK-LABEL: splat_v4f64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0> + ret <4 x double> %add +} + define <4 x float> @splat_v4f32(<4 x float> %x) #0 { ; CHECK-LABEL: splat_v4f32: ; CHECK: # %bb.0: @@ -37,6 +58,16 @@ ret <4 x float> %add } +define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v4f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0> + ret <4 x float> %add +} + define <8 x float> @splat_v8f32(<8 x float> %x) #1 { ; CHECK-LABEL: splat_v8f32: ; CHECK: # %bb.0: @@ -47,6 +78,16 @@ ret <8 x float> %add } +define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 { +; CHECK-LABEL: splat_v8f32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> + ret <8 x float> %add +} + ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { @@ -66,6 +107,23 @@ ret <2 x i64> %add } +define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v2i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v2i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <2 x i64> %x, <i64 2, i64 2> + ret <2 x i64> %add +} + ; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors, ; and then we fake it: use vmovddup to splat 64-bit value. 
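+; Assumed intent of the _pgso variants below: they expect the same
+; size-saving splat lowering as their optsize/minsize counterparts,
+; because the zero entry count in !14 marks each function as cold under
+; the module's profile summary, so PGSO applies the same heuristics
+; without either attribute.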
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { @@ -88,6 +146,26 @@ ret <4 x i64> %add } +define <4 x i64> @splat_v4i64_pgso(<4 x i64> %x) !prof !14 { +; AVX-LABEL: splat_v4i64_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] +; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i64_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2> + ret <4 x i64> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 { ; AVX-LABEL: splat_v4i32: @@ -105,6 +183,22 @@ ret <4 x i32> %add } +define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v4i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v4i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2> + ret <4 x i32> %add +} + ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { ; AVX-LABEL: splat_v8i32: @@ -125,6 +219,25 @@ ret <8 x i32> %add } +define <8 x i32> @splat_v8i32_pgso(<8 x i32> %x) !prof !14 { +; AVX-LABEL: splat_v8i32_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i32_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <8 x i32> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 { ; AVX-LABEL: splat_v8i16: @@ -141,6 +254,21 @@ ret <8 x i16> %add } +define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v8i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v8i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> + ret <8 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? 
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { ; AVX-LABEL: splat_v16i16: @@ -161,6 +289,25 @@ ret <16 x i16> %add } +define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 { +; AVX-LABEL: splat_v16i16_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i16_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> + ret <16 x i16> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 { ; AVX-LABEL: splat_v16i8: @@ -177,6 +324,21 @@ ret <16 x i8> %add } +define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v16i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v16i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2> + ret <16 x i8> %add +} + ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { ; AVX-LABEL: splat_v32i8: @@ -197,6 +359,25 @@ ret <32 x i8> %add } +define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 { +; AVX-LABEL: splat_v32i8_pgso: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: splat_v32i8_pgso: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2> + ret <32 x i8> %add +} + ; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend' ; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a ; loadi64 with multiple uses. 
@@ -238,3 +419,20 @@ attributes #0 = { optsize } attributes #1 = { minsize } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll --- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll +++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll @@ -19,6 +19,23 @@ } +define void @zero_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: zero_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $0, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: zero_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $0, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 0, i32* %p + ret void + +} + define void @minus_one_optsize(i32* %p) optsize { ; CHECK32-LABEL: minus_one_optsize: ; CHECK32: # %bb.0: # %entry @@ -36,6 +53,22 @@ } +define void @minus_one_pgso(i32* %p) !prof !14 { +; CHECK32-LABEL: minus_one_pgso: +; CHECK32: # %bb.0: # %entry +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl $-1, (%eax) +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: minus_one_pgso: +; CHECK64: # %bb.0: # %entry +; CHECK64-NEXT: movl $-1, (%rdi) +; CHECK64-NEXT: retq +entry: + store i32 -1, i32* %p + ret void + +} define void @zero_64(i64* %p) minsize { ; CHECK32-LABEL: zero_64: @@ -244,3 +277,20 @@ store volatile i16 -1, i16* %p ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/switch-density.ll b/llvm/test/CodeGen/X86/switch-density.ll --- a/llvm/test/CodeGen/X86/switch-density.ll +++ b/llvm/test/CodeGen/X86/switch-density.ll @@ -79,3 +79,72 @@ ; CHECK: ja ; CHECK: jmpq *.LJTI } + +define void @dense_optsize(i32 %x) optsize { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. 
+; CHECK-LABEL: dense_optsize +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +define void @dense_pgso(i32 %x) !prof !14 { +entry: + switch i32 %x, label %return [ + i32 12, label %bb0 + i32 4, label %bb1 + i32 16, label %bb1 + i32 20, label %bb2 + i32 8, label %bb3 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +bb2: tail call void @g(i32 1) br label %return +bb3: tail call void @g(i32 2) br label %return +return: ret void + +; Lowered as branches. +; CHECK-LABEL: dense_pgso +; CHECK: cmpl $11 +; CHECK: cmpl $20 +; CHECK: cmpl $16 +; CHECK: cmpl $12 +; CHECK: cmpl $4 +; CHECK: cmpl $8 +; CHECK: retq +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll --- a/llvm/test/CodeGen/X86/tail-opts.ll +++ b/llvm/test/CodeGen/X86/tail-opts.ll @@ -480,6 +480,47 @@ ret void } +define void @one_pgso(i32 %v) nounwind !prof !14 { +; CHECK-LABEL: one_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: je .LBB6_3 +; CHECK-NEXT: # %bb.1: # %bby +; CHECK-NEXT: cmpl $16, %edi +; CHECK-NEXT: je .LBB6_4 +; CHECK-NEXT: # %bb.2: # %bb7 +; CHECK-NEXT: jmp tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_3: # %bbx +; CHECK-NEXT: cmpl $128, %edi +; CHECK-NEXT: jne tail_call_me # TAILCALL +; CHECK-NEXT: .LBB6_4: # %return +; CHECK-NEXT: retq +entry: + %0 = icmp eq i32 %v, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 %v, label %bb7 [ + i32 16, label %return + ] + +bb7: + tail call void @tail_call_me() + ret void + +bbx: + switch i32 %v, label %bb12 [ + i32 128, label %return + ] + +bb12: + tail call void @tail_call_me() + ret void + +return: + ret void +} + ; two - Same as one, but with two instructions in the common ; tail instead of one. This is too much to be merged, given ; the optsize attribute. 
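+; two_pgso below repeats this check with the function made cold through
+; !prof !14 rather than the optsize attribute; the expected assembly is
+; unchanged, and only the autogenerated .LBB label numbers shift because
+; of the newly inserted functions.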
@@ -491,10 +532,51 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB6_1 +; CHECK-NEXT: je .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB6_1: # %bb7 +; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: movl $0, {{.*}}(%rip) +; CHECK-NEXT: movl $1, {{.*}}(%rip) +entry: + %0 = icmp eq i32 undef, 0 + br i1 %0, label %bbx, label %bby + +bby: + switch i32 undef, label %bb7 [ + i32 16, label %return + ] + +bb7: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +bbx: + switch i32 undef, label %bb12 [ + i32 128, label %return + ] + +bb12: + store volatile i32 0, i32* @XYZ + store volatile i32 1, i32* @XYZ + unreachable + +return: + ret void +} + +define void @two_pgso() nounwind !prof !14 { +; CHECK-LABEL: two_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB8_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB8_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -534,10 +616,10 @@ ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB7_1 +; CHECK-NEXT: je .LBB9_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB7_1: # %bb7 +; CHECK-NEXT: .LBB9_1: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: movl $1, {{.*}}(%rip) entry: @@ -575,20 +657,20 @@ ; CHECK-LABEL: two_nosize: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: je .LBB8_3 +; CHECK-NEXT: je .LBB10_3 ; CHECK-NEXT: # %bb.1: # %bby ; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.2: # %bb7 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_3: # %bbx +; CHECK-NEXT: .LBB10_3: # %bbx ; CHECK-NEXT: cmpl $-1, %edx -; CHECK-NEXT: je .LBB8_4 +; CHECK-NEXT: je .LBB10_4 ; CHECK-NEXT: # %bb.5: # %bb12 ; CHECK-NEXT: movl $0, {{.*}}(%rip) ; CHECK-NEXT: jmp tail_call_me # TAILCALL -; CHECK-NEXT: .LBB8_4: # %return +; CHECK-NEXT: .LBB10_4: # %return ; CHECK-NEXT: retq entry: %0 = icmp eq i32 %x, 0 @@ -628,11 +710,11 @@ ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: cmovgq %rdi, %rax ; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: jle .LBB9_2 +; CHECK-NEXT: jle .LBB11_2 ; CHECK-NEXT: # %bb.1: # %bb.nph ; CHECK-NEXT: imulq %rdi, %rsi ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: .LBB9_2: # %for.end +; CHECK-NEXT: .LBB11_2: # %for.end ; CHECK-NEXT: retq entry: %cmp = icmp slt i64 %parami, 1 ; <i1> [#uses=1] @@ -661,24 +743,24 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB10_5 +; CHECK-NEXT: je .LBB12_5 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB10_5: # %abort1 +; CHECK-NEXT: .LBB12_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort entry: @@ -721,27 +803,27 @@ ; 
CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.1: # %cont1 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.2: # %cont2 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_5 +; CHECK-NEXT: je .LBB13_5 ; CHECK-NEXT: # %bb.3: # %cont3 ; CHECK-NEXT: callq qux ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: je .LBB13_6 ; CHECK-NEXT: # %bb.4: # %cont4 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB11_5: # %abort1 +; CHECK-NEXT: .LBB13_5: # %abort1 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: callq abort -; CHECK-NEXT: .LBB11_6: # %abort2 +; CHECK-NEXT: .LBB13_6: # %abort2 ; CHECK-NEXT: callq alt_abort entry: %c1 = call i1 @qux() @@ -770,3 +852,20 @@ cont4: ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll --- a/llvm/test/CodeGen/X86/test-vs-bittest.ll +++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll @@ -49,6 +49,30 @@ ret void } +define void @test64_pgso(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB2_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + ; This test is identical to test64 above with only the destination of the br ; reversed. This somehow causes the two functions to get slightly different ; initial IR. One has an extra invert of the setcc. 
This previous caused one @@ -60,10 +84,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB2_2: # %no +; CHECK-NEXT: .LBB3_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -84,10 +108,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB3_2 +; CHECK-NEXT: jae .LBB4_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB3_2: # %no +; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 2048 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_2(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB5_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB5_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -108,10 +156,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB4_2 +; CHECK-NEXT: jb .LBB6_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB4_2: # %no +; CHECK-NEXT: .LBB6_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -132,10 +180,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jb .LBB5_2 +; CHECK-NEXT: jb .LBB7_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB5_2: # %no +; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_3(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jb .LBB8_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB8_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -156,10 +228,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB6_2 +; CHECK-NEXT: jae .LBB9_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB6_2: # %no +; CHECK-NEXT: .LBB9_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -180,10 +252,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btq $32, %rdi -; CHECK-NEXT: jae .LBB7_2 +; CHECK-NEXT: jae .LBB10_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB7_2: # %no +; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i64 %x, 4294967296 + %s = icmp eq i64 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test64_pgso_4(i64 inreg %x) !prof !14 { +; CHECK-LABEL: test64_pgso_4: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; 
CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btq $32, %rdi +; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB11_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -204,10 +300,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB8_2 +; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB8_2: # %no +; CHECK-NEXT: .LBB12_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -228,10 +324,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB9_2 +; CHECK-NEXT: jb .LBB13_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB9_2: # %no +; CHECK-NEXT: .LBB13_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -252,10 +348,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB10_2 +; CHECK-NEXT: je .LBB14_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB10_2: # %no +; CHECK-NEXT: .LBB14_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -276,10 +372,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB11_2 +; CHECK-NEXT: jae .LBB15_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB11_2: # %no +; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i32 %x, 2048 + %s = icmp eq i32 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test32_pgso_2(i32 inreg %x) !prof !14 { +; CHECK-LABEL: test32_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB16_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB16_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -300,10 +420,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: jne .LBB12_2 +; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB12_2: # %no +; CHECK-NEXT: .LBB17_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -324,10 +444,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jb .LBB13_2 +; CHECK-NEXT: jb .LBB18_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB13_2: # %no +; CHECK-NEXT: .LBB18_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %yes, label %no + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jb .LBB19_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB19_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ 
-348,10 +492,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: testl $2048, %edi # imm = 0x800 -; CHECK-NEXT: je .LBB14_2 +; CHECK-NEXT: je .LBB20_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB14_2: # %no +; CHECK-NEXT: .LBB20_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -372,10 +516,34 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: btl $11, %edi -; CHECK-NEXT: jae .LBB15_2 +; CHECK-NEXT: jae .LBB21_2 ; CHECK-NEXT: # %bb.1: # %yes ; CHECK-NEXT: callq bar -; CHECK-NEXT: .LBB15_2: # %no +; CHECK-NEXT: .LBB21_2: # %no +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %t = and i16 %x, 2048 + %s = icmp eq i16 %t, 0 + br i1 %s, label %no, label %yes + +yes: + call void @bar() + ret void +no: + ret void +} + +define void @test16_pgso_2(i16 inreg %x) !prof !14 { +; CHECK-LABEL: test16_pgso_2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: btl $11, %edi +; CHECK-NEXT: jae .LBB22_2 +; CHECK-NEXT: # %bb.1: # %yes +; CHECK-NEXT: callq bar +; CHECK-NEXT: .LBB22_2: # %no ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -512,3 +680,20 @@ } declare void @bar() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2002,6 +2002,56 @@ ret <8 x i32> %b } +define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 { +; ALL-LABEL: shuffle_v4f64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x double> %b +} + +define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 { +; ALL-LABEL: shuffle_v4i64_0zzz_pgso: +; ALL: # %bb.0: +; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; ALL-NEXT: retq + %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x i64> %b +} + +define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 { +; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512VL-NEXT: retq + %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x float> %b +} + +define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 { +; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX1OR2: # 
%bb.0: +; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512VL-NEXT: retq + %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <8 x i32> %b +} + define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) { ; ALL-LABEL: unpckh_v4i64: ; ALL: # %bb.0: @@ -2022,3 +2072,19 @@ ret <4 x double> %unpckh } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll --- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll +++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll @@ -240,3 +240,140 @@ %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 ret i64 %a } + +define i64 @and1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $31, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744071562067967 ; clear bit 31 + ret i64 %a +} + +define i64 @and2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $32, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 18446744069414584319 ; clear bit 32 + ret i64 %a +} + +define i64 @and3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $62, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 13835058055282163711 ; clear bit 62 + ret i64 %a +} + +define i64 @and4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: and4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btrq $63, %rax +; CHECK-NEXT: retq + %a = and i64 %x, 9223372036854775807 ; clear bit 63 + ret i64 %a +} + +define i64 @or1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $31, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 2147483648 ; set bit 31 + ret i64 %a +} + +define i64 @or2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $32, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4294967296 ; set bit 32 + ret i64 %a +} + +define i64 @or3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $62, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 4611686018427387904 ; set bit 62 + ret i64 %a +} + +define i64 @or4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: or4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btsq $63, %rax +; CHECK-NEXT: retq + %a = or i64 %x, 9223372036854775808 ; set bit 63 + ret i64 %a +} + +define i64 @xor1_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor1_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $31, %rax 
+; CHECK-NEXT: retq + %a = xor i64 %x, 2147483648 ; toggle bit 31 + ret i64 %a +} + +define i64 @xor2_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor2_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $32, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4294967296 ; toggle bit 32 + ret i64 %a +} + +define i64 @xor3_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor3_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $62, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 4611686018427387904 ; toggle bit 62 + ret i64 %a +} + +define i64 @xor4_pgso(i64 %x) !prof !14 { +; CHECK-LABEL: xor4_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: btcq $63, %rax +; CHECK-NEXT: retq + %a = xor i64 %x, 9223372036854775808 ; toggle bit 63 + ret i64 %a +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll @@ -50,6 +50,19 @@ ret i64 %or } +define i64 @_Z8lshift11mm_pgso(i64 %a, i64 %b) !prof !14 { +; CHECK-LABEL: _Z8lshift11mm_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shldq $11, %rsi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, 11 + %shr = lshr i64 %b, 53 + %or = or i64 %shr, %shl + ret i64 %or +} + attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; clang -O2 -c test2.cpp -emit-llvm -S @@ -78,3 +91,19 @@ attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll --- a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll +++ b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll @@ -25,6 +25,26 @@ ret void } +define void @f_pgso(i8* %p, i8* %q, i32* inalloca nocapture %unused) !prof !14 { +entry: + %g = alloca %struct.T, align 8 + %r = alloca i32, align 8 + store i32 0, i32* %r, align 4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %p, i8* align 8 %q, 
i32 24, i1 false) + br label %while.body + +while.body: ; preds = %while.body, %entry + %load = load i32, i32* %r, align 4 + %dec = add nsw i32 %load, -1 + store i32 %dec, i32* %r, align 4 + call void @g(%struct.T* %g) + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body + ret void +} + ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1 @@ -46,5 +66,38 @@ ; CHECK: testb %[[NE_REG]], %[[NE_REG]] ; CHECK: jne +; CHECK-LABEL: _f_pgso: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp +; CHECK: andl $-8, %esp +; CHECK-NOT: movl %esp, %esi +; CHECK: rep;movsl +; CHECK: leal 8(%esp), %esi + +; CHECK: decl (%esp) +; CHECK: setne %[[NE_REG:.*]] +; CHECK: pushl %esi +; CHECK: calll _g +; CHECK: addl $4, %esp +; CHECK: testb %[[NE_REG]], %[[NE_REG]] +; CHECK: jne + attributes #0 = { nounwind optsize } attributes #1 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll --- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -152,6 +152,30 @@ br label %fallthrough } + +; Negative test - opt for size (PGSO) +define void @test6_pgso(i1 %cond, i64* %base) !prof !14 { +; CHECK-LABEL: @test6_pgso +entry: +; CHECK: %addr = getelementptr + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br i1 %cond, label %if.then, label %fallthrough + +if.then: +; CHECK-LABEL: if.then: +; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40 + %v1 = load i32, i32* %casted, align 4 + call void @foo(i32 %v1) + %cmp = icmp eq i32 %v1, 0 + br i1 %cmp, label %rare.1, label %fallthrough + +fallthrough: + ret void + +rare.1: + call void @slowpath(i32 %v1, i32* %casted) cold + br label %fallthrough +} ; Make sure sinking two copies of addressing mode into different blocks works ; when there are cold paths for each. @@ -278,3 +302,20 @@ store i1 false, i1* %G23 ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0}
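+; Note on the metadata idiom shared by these tests: !14 gives each test
+; function an entry count of 0, and the DetailedSummary's last entry
+; (cutoff 999999, count 1) puts the cold-count threshold at 1, so every
+; !prof !14 function is classified as cold. The shouldOptimizeForSize()
+; queries added by this patch then return true, making these functions
+; behave like their optsize counterparts.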