diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -40,12 +40,14 @@
 }
 
 class AssumptionCache;
+class BlockFrequencyInfo;
 class BranchInst;
 class Function;
 class GlobalValue;
 class IntrinsicInst;
 class LoadInst;
 class Loop;
+class ProfileSummaryInfo;
 class SCEV;
 class ScalarEvolution;
 class StoreInst;
@@ -297,7 +299,9 @@
   /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
   /// table.
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                            unsigned &JTSize) const;
+                                            unsigned &JTSize,
+                                            ProfileSummaryInfo *PSI,
+                                            BlockFrequencyInfo *BFI) const;
 
   /// Estimate the cost of a given IR user when lowered.
   ///
@@ -1163,7 +1167,9 @@
                                const User *U) = 0;
   virtual int getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                                    unsigned &JTSize) = 0;
+                                                    unsigned &JTSize,
+                                                    ProfileSummaryInfo *PSI,
+                                                    BlockFrequencyInfo *BFI) = 0;
   virtual int
   getUserCost(const User *U, ArrayRef<const Value *> Operands) = 0;
   virtual bool hasBranchDivergence() = 0;
@@ -1656,8 +1662,10 @@
     return Impl.getMaxInterleaveFactor(VF);
   }
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                            unsigned &JTSize) override {
-    return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize);
+                                            unsigned &JTSize,
+                                            ProfileSummaryInfo *PSI,
+                                            BlockFrequencyInfo *BFI) override {
+    return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
   }
   unsigned
   getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -114,7 +114,11 @@
   }
 
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                            unsigned &JTSize) {
+                                            unsigned &JTSize,
+                                            ProfileSummaryInfo *PSI,
+                                            BlockFrequencyInfo *BFI) {
+    (void)PSI;
+    (void)BFI;
     JTSize = 0;
     return SI.getNumCases();
   }
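
Note: the default implementation above casts the new parameters to void and
keeps the conservative answer, so clients without profile data are
unaffected. A minimal caller sketch (illustrative only; TTI and Switch are
assumed to be in scope):

    unsigned JTSize = 0;
    unsigned NumClusters = TTI.getEstimatedNumberOfCaseClusters(
        Switch, JTSize, /*PSI=*/nullptr, /*BFI=*/nullptr);
    // With this base implementation, NumClusters == Switch.getNumCases().
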
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -48,6 +48,7 @@
 class GlobalValue;
 class GlobalVariable;
 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineConstantPoolValue;
 class MachineDominatorTree;
 class MachineFunction;
@@ -69,6 +70,7 @@
 class MCTargetOptions;
 class MDNode;
 class Module;
+class ProfileSummaryInfo;
 class raw_ostream;
 class StackMaps;
 class TargetLoweringObjectFile;
@@ -107,6 +109,10 @@
   /// Optimization remark emitter.
   MachineOptimizationRemarkEmitter *ORE;
 
+  MachineBlockFrequencyInfo *MBFI;
+
+  ProfileSummaryInfo *PSI;
+
   /// The symbol for the current function. This is recalculated at the beginning
   /// of each call to runOnMachineFunction().
   MCSymbol *CurrentFnSym = nullptr;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -326,7 +326,9 @@
   }
 
   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                            unsigned &JumpTableSize) {
+                                            unsigned &JumpTableSize,
+                                            ProfileSummaryInfo *PSI,
+                                            BlockFrequencyInfo *BFI) {
     /// Try to find the estimated number of clusters. Note that the number of
     /// clusters identified in this function could be different from the actual
     /// numbers found in lowering. This function ignore switches that are
@@ -374,7 +376,7 @@
           (MaxCaseVal - MinCaseVal)
               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
       // Check whether a range of clusters is dense enough for a jump table
-      if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
+      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
         JumpTableSize = Range;
         return 1;
       }
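
Note: as the comment in this hunk says, the estimate may diverge from what
lowering actually produces; it only feeds cost models such as the inliner's.
What changes here is that a switch in a profile-cold block is now evaluated
against the stricter opt-for-size density threshold. Sketch (PSI/BFI assumed
in scope):

    unsigned JTSize = 0;
    // Hot or profile-less block: the relaxed density bar applies and a
    // single jump-table cluster may be reported (JTSize = range).
    // Profile-cold block: the table can be rejected, and the estimate falls
    // back to counting dense clusters case by case.
    unsigned N = TTI.getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
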
diff --git a/llvm/include/llvm/CodeGen/ExecutionDomainFix.h b/llvm/include/llvm/CodeGen/ExecutionDomainFix.h
--- a/llvm/include/llvm/CodeGen/ExecutionDomainFix.h
+++ b/llvm/include/llvm/CodeGen/ExecutionDomainFix.h
@@ -23,6 +23,8 @@
 #define LLVM_CODEGEN_EXECUTIONDOMAINFIX_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LoopTraversal.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/ReachingDefAnalysis.h"
@@ -126,6 +128,9 @@
 
   ReachingDefAnalysis *RDA;
 
+  ProfileSummaryInfo *PSI;
+  MachineBlockFrequencyInfo *MBFI;
+
 public:
   ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC)
       : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {}
@@ -133,6 +138,8 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
     AU.addRequired<ReachingDefAnalysis>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -34,6 +34,7 @@
 namespace llvm {
 
 class LiveIntervals;
+class ProfileSummaryInfo;
 class MachineBlockFrequencyInfo;
 class MachineInstr;
 class MachineLoopInfo;
@@ -103,14 +104,17 @@
 
   /// foldAsLoad - If LI has a single use and a single def that can be folded as
   /// a load, eliminate the register by folding the def into the use.
-  bool foldAsLoad(LiveInterval *LI, SmallVectorImpl<MachineInstr *> &Dead);
+  bool foldAsLoad(LiveInterval *LI, SmallVectorImpl<MachineInstr *> &Dead,
+                  ProfileSummaryInfo *PSI,
+                  const MachineBlockFrequencyInfo *MBFI);
 
   using ToShrinkSet = SetVector<LiveInterval *, SmallVector<LiveInterval *, 8>,
                                 SmallPtrSet<LiveInterval *, 8>>;
 
   /// Helper for eliminateDeadDefs.
   void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
-                        AliasAnalysis *AA);
+                        AliasAnalysis *AA, ProfileSummaryInfo *PSI,
+                        const MachineBlockFrequencyInfo *MBFI);
 
   /// MachineRegisterInfo callback to notify when new virtual
   /// registers are created.
@@ -243,6 +247,8 @@
   /// allocator.  These registers should not be split into new intervals
   /// as currently those new intervals are not guaranteed to spill.
   void eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                         ProfileSummaryInfo *PSI,
+                         const MachineBlockFrequencyInfo *MBFI,
                          ArrayRef<unsigned> RegsBeingSpilled = None,
                          AliasAnalysis *AA = nullptr);
 
diff --git a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
--- a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
@@ -38,6 +38,9 @@
   static char ID;
 
   MachineBlockFrequencyInfo();
+  explicit MachineBlockFrequencyInfo(MachineFunction &F,
+                                     MachineBranchProbabilityInfo &MBPI,
+                                     MachineLoopInfo &MLI);
   ~MachineBlockFrequencyInfo() override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h
--- a/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -81,6 +81,9 @@
   static char ID; // Pass ID, replacement for typeid
 
   MachineDominatorTree();
+  explicit MachineDominatorTree(MachineFunction &MF) : MachineFunctionPass(ID) {
+    calculate(MF);
+  }
 
   DomTreeT &getBase() {
     if (!DT) DT.reset(new DomTreeT());
@@ -111,6 +114,8 @@
 
   bool runOnMachineFunction(MachineFunction &F) override;
 
+  void calculate(MachineFunction &F);
+
   bool dominates(const MachineDomTreeNode *A,
                  const MachineDomTreeNode *B) const {
     applySplitCriticalEdges();
diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -37,6 +37,7 @@
 
 namespace llvm {
 
+class MachineDominatorTree;
 // Implementation in LoopInfoImpl.h
 class MachineLoop;
 extern template class LoopBase<MachineBasicBlock, MachineLoop>;
@@ -91,6 +92,10 @@
   MachineLoopInfo() : MachineFunctionPass(ID) {
     initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
   }
+  explicit MachineLoopInfo(MachineDominatorTree &MDT)
+      : MachineFunctionPass(ID) {
+    calculate(MDT);
+  }
   MachineLoopInfo(const MachineLoopInfo &) = delete;
   MachineLoopInfo &operator=(const MachineLoopInfo &) = delete;
 
@@ -133,6 +138,7 @@
 
   /// Calculate the natural loop information.
   bool runOnMachineFunction(MachineFunction &F) override;
+  void calculate(MachineDominatorTree &MDT);
 
   void releaseMemory() override { LI.releaseMemory(); }
 
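
Note: together with the new MachineDominatorTree(MF) and
MachineBlockFrequencyInfo constructors added above, this lets utility code
build the frequency analysis without the pass manager. Intended composition
(sketch; assumes a MachineFunction MF in scope):

    MachineDominatorTree MDT(MF);
    MachineLoopInfo MLI(MDT);
    MachineBranchProbabilityInfo MBPI;
    MachineBlockFrequencyInfo MBFI(MF, MBPI, MLI);
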
diff --git a/llvm/include/llvm/CodeGen/MachineSizeOpts.h b/llvm/include/llvm/CodeGen/MachineSizeOpts.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MachineSizeOpts.h
@@ -0,0 +1,37 @@
+//===- MachineSizeOpts.h - machine size optimization ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some shared machine IR code size optimization related
+// code.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CODEGEN_MACHINE_SIZEOPTS_H
+#define LLVM_CODEGEN_MACHINE_SIZEOPTS_H
+
+#include "llvm/Transforms/Utils/SizeOpts.h"
+
+namespace llvm {
+
+class ProfileSummaryInfo;
+class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
+class MachineFunction;
+
+/// Returns true if machine function \p MF is suggested to be size-optimized
+/// based on the profile.
+bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI,
+                           const MachineBlockFrequencyInfo *MBFI);
+/// Returns true if machine basic block \p MBB is suggested to be
+/// size-optimized based on the profile.
+bool shouldOptimizeForSize(const MachineBasicBlock *MBB,
+                           ProfileSummaryInfo *PSI,
+                           const MachineBlockFrequencyInfo *MBFI);
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_MACHINE_SIZEOPTS_H
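
Note: a typical consumer is a MachineFunctionPass wired up the same way the
passes later in this patch are. Hypothetical sketch (MyPass and its pass
boilerplate are assumed; the accessors are the standard wrapper-pass ones):

    #include "llvm/CodeGen/MachineSizeOpts.h"

    bool MyPass::runOnMachineFunction(MachineFunction &MF) {
      ProfileSummaryInfo *PSI =
          &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
      const MachineBlockFrequencyInfo *MBFI =
          &getAnalysis<MachineBlockFrequencyInfo>();
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MF, PSI, MBFI);
      // Gate size-sensitive transforms on OptForSize here.
      return false;
    }
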
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -59,6 +59,7 @@
 namespace llvm {
 
 class BlockAddress;
+class BlockFrequencyInfo;
 class Constant;
 class ConstantFP;
 class ConstantInt;
@@ -71,6 +72,7 @@
 class MachineConstantPoolValue;
 class MCSymbol;
 class OptimizationRemarkEmitter;
+class ProfileSummaryInfo;
 class SDDbgValue;
 class SDDbgLabel;
 class SelectionDAG;
@@ -235,6 +237,9 @@
   /// whenever manipulating the DAG.
   OptimizationRemarkEmitter *ORE;
 
+  ProfileSummaryInfo *PSI;
+  BlockFrequencyInfo *BFI;
+
   /// The starting token.
   SDNode EntryNode;
 
@@ -401,7 +406,8 @@
   /// Prepare this SelectionDAG to process code in the given MachineFunction.
   void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
             Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
-            LegacyDivergenceAnalysis * Divergence);
+            LegacyDivergenceAnalysis * Divergence,
+            ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin);
 
   void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
     FLI = FuncInfo;
@@ -423,6 +429,8 @@
   const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; }
   LLVMContext *getContext() const {return Context; }
   OptimizationRemarkEmitter &getORE() const { return *ORE; }
+  ProfileSummaryInfo *getPSI() const { return PSI; }
+  BlockFrequencyInfo *getBFI() const { return BFI; }
 
   /// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
   void viewGraph(const std::string &Title);
@@ -1697,6 +1705,8 @@
     return It->second.HeapAllocSite;
   }
 
+  bool shouldOptForSize() const;
+
 private:
   void InsertNode(SDNode *N);
   bool RemoveNodeFromCSEMaps(SDNode *N);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h
@@ -38,6 +38,8 @@
   class GCFunctionInfo;
   class ScheduleDAGSDNodes;
   class LoadInst;
+  class ProfileSummaryInfo;
+  class BlockFrequencyInfo;
 
 /// SelectionDAGISel - This is the common base class used for SelectionDAG-based
 /// pattern-matching instruction selectors.
@@ -248,6 +250,11 @@
   virtual StringRef getIncludePathForIndex(unsigned index) {
     llvm_unreachable("Tblgen should generate the implementation of this!");
   }
+
+  bool shouldOptForSize(const MachineFunction *MF) const {
+    return CurDAG->shouldOptForSize();
+  }
+
 public:
   // Calls to these predicates are generated by tblgen.
   bool CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
--- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
+++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h
@@ -19,6 +19,7 @@
 
 class FunctionLoweringInfo;
 class MachineBasicBlock;
+class BlockFrequencyInfo;
 
 namespace SwitchCG {
 
@@ -264,7 +265,8 @@
   std::vector<BitTestBlock> BitTestCases;
 
   void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
-                      MachineBasicBlock *DefaultMBB);
+                      MachineBasicBlock *DefaultMBB,
+                      ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI);
 
   bool buildJumpTable(const CaseClusterVector &Clusters, unsigned First,
                       unsigned Last, const SwitchInst *SI,
@@ -295,4 +297,3 @@
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_SWITCHLOWERINGUTILS_H
-
diff --git a/llvm/include/llvm/CodeGen/TailDuplicator.h b/llvm/include/llvm/CodeGen/TailDuplicator.h
--- a/llvm/include/llvm/CodeGen/TailDuplicator.h
+++ b/llvm/include/llvm/CodeGen/TailDuplicator.h
@@ -25,11 +25,13 @@
 namespace llvm {
 
 class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
 class MachineBranchProbabilityInfo;
 class MachineFunction;
 class MachineInstr;
 class MachineModuleInfo;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class TargetRegisterInfo;
 
 /// Utility class to perform tail duplication.
@@ -40,6 +42,8 @@
   const MachineModuleInfo *MMI;
   MachineRegisterInfo *MRI;
   MachineFunction *MF;
+  const MachineBlockFrequencyInfo *MBFI;
+  ProfileSummaryInfo *PSI;
   bool PreRegAlloc;
   bool LayoutMode;
   unsigned TailDupSize;
@@ -65,6 +69,8 @@
   ///     default implies using the command line value TailDupSize.
   void initMF(MachineFunction &MF, bool PreRegAlloc,
               const MachineBranchProbabilityInfo *MBPI,
+              const MachineBlockFrequencyInfo *MBFI,
+              ProfileSummaryInfo *PSI,
               bool LayoutMode, unsigned TailDupSize = 0);
 
   bool tailDuplicateBlocks();
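
Note: initMF gains two parameters in the middle of its argument list, so
every caller needs updating. An updated call would look like this
(hypothetical caller; the analysis pointers are assumed to be in scope):

    TailDup.initMF(MF, /*PreRegAlloc=*/true, MBPI, MBFI, PSI,
                   /*LayoutMode=*/false);
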
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -43,12 +43,14 @@
 class InstrItineraryData;
 class LiveIntervals;
 class LiveVariables;
+class MachineBlockFrequencyInfo;
 class MachineMemOperand;
 class MachineRegisterInfo;
 class MCAsmInfo;
 class MCInst;
 struct MCSchedModel;
 class Module;
+class ProfileSummaryInfo;
 class ScheduleDAG;
 class ScheduleHazardRecognizer;
 class SDNode;
@@ -130,7 +132,10 @@
   /// Do not call this method for a non-commutable instruction.
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands, null pointer is returned in such cases.
-  virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                               ProfileSummaryInfo *PSI,
+                                               const MachineBlockFrequencyInfo *MBFI,
+                                               bool NewMI,
                                                unsigned OpIdx1,
                                                unsigned OpIdx2) const;
 
@@ -401,7 +406,9 @@
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands, null pointer is returned in such cases.
   MachineInstr *
-  commuteInstruction(MachineInstr &MI, bool NewMI = false,
+  commuteInstruction(MachineInstr &MI, ProfileSummaryInfo *PSI,
+                     const MachineBlockFrequencyInfo *MBFI,
+                     bool NewMI = false,
                      unsigned OpIdx1 = CommuteAnyOperandIndex,
                      unsigned OpIdx2 = CommuteAnyOperandIndex) const;
 
@@ -1000,6 +1007,8 @@
   /// decide on using an opcode (note that those assignments can still change).
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
                                   int FI,
+                                  ProfileSummaryInfo *PSI,
+                                  const MachineBlockFrequencyInfo *MBFI,
                                   LiveIntervals *LIS = nullptr,
                                   VirtRegMap *VRM = nullptr) const;
 
@@ -1007,6 +1016,8 @@
   /// store from / to any address, not just from a specific stack slot.
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
                                   MachineInstr &LoadMI,
+                                  ProfileSummaryInfo *PSI,
+                                  const MachineBlockFrequencyInfo *MBFI,
                                   LiveIntervals *LIS = nullptr) const;
 
   /// Return true when there is potentially a faster code sequence
@@ -1091,6 +1102,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        ProfileSummaryInfo *PSI,
+                        const MachineBlockFrequencyInfo *MBFI,
                         LiveIntervals *LIS = nullptr,
                         VirtRegMap *VRM = nullptr) const {
     return nullptr;
@@ -1104,6 +1117,7 @@
   virtual MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
       LiveIntervals *LIS = nullptr) const {
     return nullptr;
   }
@@ -1390,7 +1404,9 @@
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
                                           const MachineRegisterInfo *MRI,
                                           unsigned &FoldAsLoadDefReg,
-                                          MachineInstr *&DefMI) const {
+                                          MachineInstr *&DefMI,
+                                          ProfileSummaryInfo *PSI,
+                                          const MachineBlockFrequencyInfo *MBFI) const {
     return nullptr;
   }
 
@@ -1512,7 +1528,9 @@
   ///
   /// The bit (1 << Domain) must be set in the mask returned from
   /// getExecutionDomain(MI).
-  virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain) const {}
+  virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain,
+                                  ProfileSummaryInfo *PSI,
+                                  const MachineBlockFrequencyInfo *MBFI) const {}
 
   /// Returns the preferred minimum clearance
   /// before an instruction with an unwanted partial register update.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -54,6 +55,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
 #include <cassert>
 #include <climits>
@@ -1029,13 +1031,16 @@
   /// Return true if lowering to a jump table is suitable for a set of case
   /// clusters which may contain \p NumCases cases, \p Range range of values.
   virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
-                                      uint64_t Range) const {
+                                      uint64_t Range, ProfileSummaryInfo *PSI,
+                                      BlockFrequencyInfo *BFI) const {
     // FIXME: This function check the maximum table size and density, but the
     // minimum size is not checked. It would be nice if the minimum size is
     // also combined within this function. Currently, the minimum size check is
     // performed in findJumpTable() in SelectionDAGBuiler and
     // getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
-    const bool OptForSize = SI->getParent()->getParent()->hasOptSize();
+    const bool OptForSize = SI->getParent()->getParent()->hasOptSize() ||
+                            llvm::shouldOptimizeForSize(SI->getParent(), PSI,
+                                                        BFI);
     const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
     const unsigned MaxJumpTableSize = getMaximumJumpTableSize();
     
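
Note: a worked example of the new check, assuming the default thresholds in
TargetLoweringBase.cpp (about 10% minimum density normally versus 40% when
optimizing for size):

    // Hypothetical switch: 30 cases spread over a value range of 100,
    // i.e. 30% density.
    // Hot or profile-less block: 30% >= 10%, so a jump table is emitted.
    // PGSO-cold block: OptForSize is true and 30% < 40%, so the switch
    // keeps its branch tree instead.
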
diff --git a/llvm/include/llvm/Transforms/Utils/SizeOpts.h b/llvm/include/llvm/Transforms/Utils/SizeOpts.h
--- a/llvm/include/llvm/Transforms/Utils/SizeOpts.h
+++ b/llvm/include/llvm/Transforms/Utils/SizeOpts.h
@@ -13,6 +13,18 @@
 #ifndef LLVM_TRANSFORMS_UTILS_SIZEOPTS_H
 #define LLVM_TRANSFORMS_UTILS_SIZEOPTS_H
 
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+extern cl::opt<bool> EnablePGSO;
+extern cl::opt<bool> PGSOLargeWorkingSetSizeOnly;
+extern cl::opt<bool> ForcePGSO;
+extern cl::opt<int> PgsoCutoffInstrProf;
+extern cl::opt<int> PgsoCutoffSampleProf;
+
 namespace llvm {
 
 class BasicBlock;
@@ -20,13 +32,52 @@
 class Function;
 class ProfileSummaryInfo;
 
+template<typename AdapterT, typename FuncT, typename BFIT>
+bool shouldFuncOptimizeForSizeImpl(const FuncT *F, ProfileSummaryInfo *PSI,
+                                   BFIT *BFI) {
+  assert(F);
+  if (!PSI || !BFI || !PSI->hasProfileSummary())
+    return false;
+  if (ForcePGSO)
+    return true;
+  if (!EnablePGSO)
+    return false;
+  if (PGSOLargeWorkingSetSizeOnly && !PSI->hasLargeWorkingSetSize()) {
+    // Even if the working set size isn't large, size-optimize cold code.
+    return AdapterT::isFunctionColdInCallGraph(F, PSI, *BFI);
+  }
+  return !AdapterT::isFunctionHotInCallGraphNthPercentile(
+      PSI->hasSampleProfile() ? PgsoCutoffSampleProf : PgsoCutoffInstrProf,
+      F, PSI, *BFI);
+}
+
+template<typename AdapterT, typename BlockT, typename BFIT>
+bool shouldOptimizeForSizeImpl(const BlockT *BB, ProfileSummaryInfo *PSI,
+                               BFIT *BFI) {
+  assert(BB);
+  if (!PSI || !BFI || !PSI->hasProfileSummary())
+    return false;
+  if (ForcePGSO)
+    return true;
+  if (!EnablePGSO)
+    return false;
+  if (PGSOLargeWorkingSetSizeOnly && !PSI->hasLargeWorkingSetSize()) {
+    // Even if the working set size isn't large, size-optimize cold code.
+    return AdapterT::isColdBlock(BB, PSI, BFI);
+  }
+  return !AdapterT::isHotBlockNthPercentile(
+      PSI->hasSampleProfile() ? PgsoCutoffSampleProf : PgsoCutoffInstrProf,
+      BB, PSI, BFI);
+}
+
 /// Returns true if function \p F is suggested to be size-optimized base on the
 /// profile.
-bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
+bool shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
                            BlockFrequencyInfo *BFI);
+
 /// Returns true if basic block \p BB is suggested to be size-optimized base
 /// on the profile.
-bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
+bool shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
                            BlockFrequencyInfo *BFI);
 
 } // end namespace llvm
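
Note: the AdapterT template parameter is what lets IR and machine IR share
this logic. A minimal IR-level adapter in the spirit of the one SizeOpts.cpp
provides (sketch; the static methods simply forward to the corresponding
ProfileSummaryInfo queries):

    struct BasicBlockBFIAdapter {
      static bool isFunctionColdInCallGraph(const Function *F,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo &BFI) {
        return PSI->isFunctionColdInCallGraph(F, BFI);
      }
      static bool isFunctionHotInCallGraphNthPercentile(
          int CutOff, const Function *F, ProfileSummaryInfo *PSI,
          BlockFrequencyInfo &BFI) {
        return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
      }
      static bool isColdBlock(const BasicBlock *BB, ProfileSummaryInfo *PSI,
                              BlockFrequencyInfo *BFI) {
        return PSI->isColdBlock(BB, BFI);
      }
      static bool isHotBlockNthPercentile(int CutOff, const BasicBlock *BB,
                                          ProfileSummaryInfo *PSI,
                                          BlockFrequencyInfo *BFI) {
        return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
      }
    };

    // shouldOptimizeForSize(F, PSI, BFI) can then forward to
    // shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI).
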
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -1456,8 +1456,9 @@
   int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
 
   unsigned JumpTableSize = 0;
+  BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
   unsigned NumCaseCluster =
-      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
 
   // If suitable for a jump table, consider the cost for the table size and
   // branch to destination.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -194,9 +194,10 @@
 }
 
 unsigned
-TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
-                                                      unsigned &JTSize) const {
-  return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize);
+TargetTransformInfo::getEstimatedNumberOfCaseClusters(
+    const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI) const {
+  return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
 }
 
 int TargetTransformInfo::getUserCost(const User *U,
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -31,13 +31,16 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -52,6 +55,7 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -253,6 +257,8 @@
   AU.addRequired<MachineModuleInfoWrapperPass>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
 }
 
 bool AsmPrinter::doInitialization(Module &M) {
@@ -1691,6 +1697,10 @@
   }
 
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
 }
 
 namespace {
@@ -2909,8 +2919,10 @@
 void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB,
                                          MCCodePaddingContext &Context) const {
   assert(MF != nullptr && "Machine function must be valid");
+  bool OptForSize = MF->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
   Context.IsPaddingActive = !MF->hasInlineAsm() &&
-                            !MF->getFunction().hasOptSize() &&
+                            !OptForSize &&
                             TM.getOptLevel() != CodeGenOpt::None;
   Context.IsBasicBlockReachableViaFallthrough =
       std::find(MBB.pred_begin(), MBB.pred_end(), MBB.getPrevNode()) !=
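
Note: this is the guard pattern used throughout the patch: the lazy MBFI is
only materialized when a profile summary exists, and the SizeOpts helpers
bail out on a null BFI, so non-PGO builds see no extra analysis cost and no
behavior change. Spelled out (see shouldOptimizeForSizeImpl above):

    if (!PSI || !BFI || !PSI->hasProfileSummary())
      return false;  // no profile: PGSO never triggers
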
diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h
--- a/llvm/lib/CodeGen/BranchFolding.h
+++ b/llvm/lib/CodeGen/BranchFolding.h
@@ -27,6 +27,7 @@
 class MachineLoopInfo;
 class MachineModuleInfo;
 class MachineRegisterInfo;
+class ProfileSummaryInfo;
 class raw_ostream;
 class TargetInstrInfo;
 class TargetRegisterInfo;
@@ -39,6 +40,7 @@
                           bool CommonHoist,
                           MBFIWrapper &FreqInfo,
                           const MachineBranchProbabilityInfo &ProbInfo,
+                          ProfileSummaryInfo *PSI,
                           // Min tail length to merge. Defaults to commandline
                           // flag. Ignored for optsize.
                           unsigned MinTailLength = 0);
@@ -145,6 +147,7 @@
                                   const BlockFrequency Freq) const;
       void view(const Twine &Name, bool isSimple = true);
       uint64_t getEntryFreq() const;
+      const MachineBlockFrequencyInfo &getMBFI() const { return MBFI; }
 
     private:
       const MachineBlockFrequencyInfo &MBFI;
@@ -154,6 +157,7 @@
   private:
     MBFIWrapper &MBBFreqInfo;
     const MachineBranchProbabilityInfo &MBPI;
+    ProfileSummaryInfo *PSI;
 
     bool TailMergeBlocks(MachineFunction &MF);
     bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -38,6 +39,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -102,6 +104,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
       AU.addRequired<TargetPassConfig>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -128,7 +131,8 @@
   BranchFolder::MBFIWrapper MBBFreqInfo(
       getAnalysis<MachineBlockFrequencyInfo>());
   BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
-                      getAnalysis<MachineBranchProbabilityInfo>());
+                      getAnalysis<MachineBranchProbabilityInfo>(),
+                      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
   auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
   return Folder.OptimizeFunction(
       MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(),
@@ -138,9 +142,10 @@
 BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
                            MBFIWrapper &FreqInfo,
                            const MachineBranchProbabilityInfo &ProbInfo,
+                           ProfileSummaryInfo *PSI,
                            unsigned MinTailLength)
     : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
-      MBBFreqInfo(FreqInfo), MBPI(ProbInfo) {
+      MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) {
   if (MinCommonTailLength == 0)
     MinCommonTailLength = TailMergeSize;
   switch (FlagEnableTailMerge) {
@@ -641,7 +646,9 @@
                   MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
                   MachineBasicBlock *PredBB,
                   DenseMap<const MachineBasicBlock *, int> &EHScopeMembership,
-                  bool AfterPlacement) {
+                  bool AfterPlacement,
+                  BranchFolder::MBFIWrapper &MBBFreqInfo,
+                  ProfileSummaryInfo *PSI) {
   // It is never profitable to tail-merge blocks from two different EH scopes.
   if (!EHScopeMembership.empty()) {
     auto EHScope1 = EHScopeMembership.find(MBB1);
@@ -727,7 +734,11 @@
   // branch instruction, which is likely to be smaller than the 2
   // instructions that would be deleted in the merge.
   MachineFunction *MF = MBB1->getParent();
-  return EffectiveTailLen >= 2 && MF->getFunction().hasOptSize() &&
+  bool OptForSize =
+      MF->getFunction().hasOptSize() ||
+      (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) &&
+       llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI()));
+  return EffectiveTailLen >= 2 && OptForSize &&
          (I1 == MBB1->begin() || I2 == MBB2->begin());
 }
 
@@ -749,7 +760,7 @@
                             CommonTailLen, TrialBBI1, TrialBBI2,
                             SuccBB, PredBB,
                             EHScopeMembership,
-                            AfterBlockPlacement)) {
+                            AfterBlockPlacement, MBBFreqInfo, PSI)) {
         if (CommonTailLen > maxCommonTailLength) {
           SameTails.clear();
           maxCommonTailLength = CommonTailLen;
@@ -1579,8 +1590,10 @@
     }
   }
 
-  if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
-      MF.getFunction().hasOptSize()) {
+  bool OptForSize =
+      MF.getFunction().hasOptSize() ||
+      llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI());
+  if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) {
     // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
     // direction, thereby defeating careful block placement and regressing
     // performance. Therefore, only consider this for optsize functions.
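
Note: for tail merging the profile check is a conjunction rather than a
disjunction: the merge only pays off for size and it rewrites both blocks,
so (absent the optsize attribute) both candidates must be profile-cold.
Paraphrased:

    bool OptForSize =
        MF->getFunction().hasOptSize() ||
        (shouldOptimizeForSize(MBB1, PSI, &MBFI) &&  // both tails cold,
         shouldOptimizeForSize(MBB2, PSI, &MBFI));   // not just one of them
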
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -92,6 +92,7 @@
   MachineRegisterInfo.cpp
   MachineScheduler.cpp
   MachineSink.cpp
+  MachineSizeOpts.cpp
   MachineSSAUpdater.cpp
   MachineTraceMetrics.cpp
   MachineVerifier.cpp
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -89,6 +89,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -251,6 +252,7 @@
     const LoopInfo *LI;
     std::unique_ptr<BlockFrequencyInfo> BFI;
     std::unique_ptr<BranchProbabilityInfo> BPI;
+    ProfileSummaryInfo *PSI;
 
     /// As we scan instructions optimizing them, this is the next instruction
     /// to optimize. Transforms that can invalidate this should update it.
@@ -293,7 +295,7 @@
     /// Keep track of SExt promoted.
     ValueToSExts ValToSExtendedUses;
 
-    /// True if optimizing for size.
+    /// True if the function has the OptSize attribute.
     bool OptSize;
 
     /// DataLayout for the Function being processed.
@@ -429,10 +431,8 @@
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   BPI.reset(new BranchProbabilityInfo(F, *LI));
   BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   OptSize = F.hasOptSize();
-
-  ProfileSummaryInfo *PSI =
-      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   if (ProfileGuidedSectionPrefix) {
     if (PSI->isFunctionHotInCallGraph(&F, *BFI))
       F.setSectionPrefix(".hot");
@@ -451,7 +451,9 @@
       // bypassSlowDivision may create new BBs, but we don't want to reapply the
       // optimization to those blocks.
       BasicBlock* Next = BB->getNextNode();
-      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+      // F.hasOptSize() is already checked in the outer if statement.
+      if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+        EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
       BB = Next;
     }
   }
@@ -1842,7 +1844,8 @@
   // cold block.  This interacts with our handling for loads and stores to
   // ensure that we can fold all uses of a potential addressing computation
   // into their uses.  TODO: generalize this to work over profiling data
-  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get());
+  if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
     for (auto &Arg : CI->arg_operands()) {
       if (!Arg->getType()->isPointerTy())
         continue;
@@ -2791,16 +2794,24 @@
   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
   bool IgnoreProfitability;
 
+  /// True if the function has the OptSize attribute.
+  bool OptSize;
+
+  ProfileSummaryInfo *PSI;
+  BlockFrequencyInfo *BFI;
+
   AddressingModeMatcher(
       SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
       const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
       ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
       InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
-      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP)
+      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+      bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
-        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) {
+        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP),
+        OptSize(OptSize), PSI(PSI), BFI(BFI) {
     IgnoreProfitability = false;
   }
 
@@ -2818,12 +2829,14 @@
         const TargetLowering &TLI, const TargetRegisterInfo &TRI,
         const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
         TypePromotionTransaction &TPT,
-        std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) {
+        std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+        bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
     ExtAddrMode Result;
 
     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
                                          MemoryInst, Result, InsertedInsts,
-                                         PromotedInsts, TPT, LargeOffsetGEP)
+                                         PromotedInsts, TPT, LargeOffsetGEP,
+                                         OptSize, PSI, BFI)
                        .matchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
     return Result;
@@ -4434,7 +4447,8 @@
     Instruction *I,
     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
-    const TargetRegisterInfo &TRI, int SeenInsts = 0) {
+    const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI, int SeenInsts = 0) {
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I).second)
     return false;
@@ -4443,8 +4457,6 @@
   if (!MightBeFoldableInst(I))
     return true;
 
-  const bool OptSize = I->getFunction()->hasOptSize();
-
   // Loop over all the uses, recursively processing them.
   for (Use &U : I->uses()) {
     // Conservatively return true if we're seeing a large number or a deep chain
@@ -4485,7 +4497,9 @@
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
       // If this is a cold call, we can sink the addressing calculation into
       // the cold path.  See optimizeCallInst
-      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+      bool OptForSize = OptSize ||
+          llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+      if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
         continue;
 
       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
@@ -4497,8 +4511,8 @@
       continue;
     }
 
-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI,
-                          SeenInsts))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+                          PSI, BFI, SeenInsts))
       return true;
   }
 
@@ -4586,7 +4600,8 @@
   // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+                        PSI, BFI))
     return false;  // Has a non-memory, non-foldable use!
 
   // Now that we know that all uses of this instruction are part of a chain of
@@ -4622,7 +4637,7 @@
         TPT.getRestorationPoint();
     AddressingModeMatcher Matcher(
         MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI);
     Matcher.IgnoreProfitability = true;
     bool Success = Matcher.matchAddr(Address, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -4728,7 +4743,8 @@
                                                                       0);
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
-        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
+        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
+        BFI.get());
 
     GetElementPtrInst *GEP = LargeOffsetGEP.first;
     if (GEP && !NewGEPBases.count(GEP)) {
@@ -5946,7 +5962,9 @@
 /// turn it into a branch.
 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
   // If branch conversion isn't desirable, exit early.
-  if (DisableSelectToBranch || OptSize || !TLI)
+  if (DisableSelectToBranch ||
+      OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) ||
+      !TLI)
     return false;
 
   // Find all consecutive select instructions that share the same condition.
diff --git a/llvm/lib/CodeGen/ExecutionDomainFix.cpp b/llvm/lib/CodeGen/ExecutionDomainFix.cpp
--- a/llvm/lib/CodeGen/ExecutionDomainFix.cpp
+++ b/llvm/lib/CodeGen/ExecutionDomainFix.cpp
@@ -113,7 +113,7 @@
 
   // Collapse all the instructions.
   while (!dv->Instrs.empty())
-    TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
+    TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain, PSI, MBFI);
   dv->setSingleDomain(domain);
 
   // If there are multiple users, give them new, unique DomainValues.
@@ -318,7 +318,7 @@
   // If the collapsed operands force a single domain, propagate the collapse.
   if (isPowerOf2_32(available)) {
     unsigned domain = countTrailingZeros(available);
-    TII->setExecutionDomain(*mi, domain);
+    TII->setExecutionDomain(*mi, domain, PSI, MBFI);
     visitHardInstr(mi, domain);
     return;
   }
@@ -436,6 +436,11 @@
 
   RDA = &getAnalysis<ReachingDefAnalysis>();
 
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
+
   // Initialize the AliasMap on the first use.
   if (AliasMap.empty()) {
     // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -13,6 +13,8 @@
 
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -20,6 +22,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 
 using namespace llvm;
 
@@ -720,7 +723,8 @@
 ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
 ///  ret i32 %phi.res
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL) {
+                         const TargetLowering *TLI, const DataLayout *DL,
+                         ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
   NumMemCmpCalls++;
 
   // Early exit from expansion if -Oz.
@@ -741,18 +745,20 @@
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
+  bool OptForSize = CI->getFunction()->hasOptSize() ||
+                    llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+  auto Options = TTI->enableMemCmpExpansion(OptForSize,
                                             IsUsedForZeroCmp);
   if (!Options) return false;
 
   if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
     Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
 
-  if (CI->getFunction()->hasOptSize() &&
+  if (OptForSize &&
       MaxLoadsPerMemcmpOptSize.getNumOccurrences())
     Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
 
-  if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
+  if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
     Options.MaxNumLoads = MaxLoadsPerMemcmp;
 
   MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL);
@@ -798,7 +804,11 @@
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
     const TargetTransformInfo *TTI =
         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto PA = runImpl(F, TLI, TTI, TL);
+    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+    auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+           &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+           nullptr;
+    auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI);
     return !PA.areAllPreserved();
   }
 
@@ -806,22 +816,26 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
     FunctionPass::getAnalysisUsage(AU);
   }
 
   PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                             const TargetTransformInfo *TTI,
-                            const TargetLowering* TL);
+                            const TargetLowering* TL,
+                            ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI);
   // Returns true if a change was made.
   bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
                   const TargetTransformInfo *TTI, const TargetLowering* TL,
-                  const DataLayout& DL);
+                  const DataLayout& DL, ProfileSummaryInfo *PSI,
+                  BlockFrequencyInfo *BFI);
 };
 
 bool ExpandMemCmpPass::runOnBlock(
     BasicBlock &BB, const TargetLibraryInfo *TLI,
     const TargetTransformInfo *TTI, const TargetLowering* TL,
-    const DataLayout& DL) {
+    const DataLayout& DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
   for (Instruction& I : BB) {
     CallInst *CI = dyn_cast<CallInst>(&I);
     if (!CI) {
@@ -830,7 +844,7 @@
     LibFunc Func;
     if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL)) {
+        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) {
       return true;
     }
   }
@@ -840,11 +854,12 @@
 
 PreservedAnalyses ExpandMemCmpPass::runImpl(
     Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
-    const TargetLowering* TL) {
+    const TargetLowering* TL, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI) {
   const DataLayout& DL = F.getParent()->getDataLayout();
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
-    if (runOnBlock(*BBIt, TLI, TTI, TL, DL)) {
+    if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI)) {
       MadeChanges = true;
       // If changes were made, restart the function from the beginning, since
       // the structure of the function was changed.
@@ -863,6 +878,8 @@
                       "Expand memcmp() to load/stores", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
                     "Expand memcmp() to load/stores", false, false)
 
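
Note: with this change the memcmp expansion limits follow the block's
profile, not just the function attribute. Illustrative effect, using the
existing flags in this file:

    // memcmp call in a hot block: up to MaxLoadsPerMemcmp loads per
    // expansion (when that flag is set on the command line).
    // Same call in a PGSO-cold block: OptForSize is true, so the
    // MaxLoadsPerMemcmpOptSize limit and the optsize expansion rules apply.
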
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -465,7 +465,7 @@
     return true;
   }
 
-  SL->findJumpTables(Clusters, &SI, DefaultMBB);
+  SL->findJumpTables(Clusters, &SI, DefaultMBB, nullptr, nullptr);
 
   LLVM_DEBUG({
     dbgs() << "Case clusters: ";
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -211,6 +212,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -432,6 +434,7 @@
 
 INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false)
 
 bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
@@ -444,6 +447,8 @@
   TRI = ST.getRegisterInfo();
   BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+  ProfileSummaryInfo *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
 
@@ -454,7 +459,7 @@
   bool BFChange = false;
   if (!PreRegAlloc) {
     // Tail merge tend to expose more if-conversion opportunities.
-    BranchFolder BF(true, false, MBFI, *MBPI);
+    BranchFolder BF(true, false, MBFI, *MBPI, PSI);
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
     BFChange = BF.OptimizeFunction(
         MF, TII, ST.getRegisterInfo(),
@@ -596,7 +601,7 @@
   BBAnalysis.clear();
 
   if (MadeChange && IfCvtBranchFold) {
-    BranchFolder BF(false, false, MBFI, *MBPI);
+    BranchFolder BF(false, false, MBFI, *MBPI, PSI);
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
     BF.OptimizeFunction(
         MF, TII, MF.getSubtarget().getRegisterInfo(),
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
@@ -94,6 +95,7 @@
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  ProfileSummaryInfo *PSI;
 
   InsertPointAnalysis IPA;
 
@@ -146,6 +148,7 @@
         MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
         MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+        PSI(&pass.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI()),
         IPA(LIS, mf.getNumBlockIDs()) {}
 
   void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
@@ -167,6 +170,7 @@
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  ProfileSummaryInfo *PSI;
 
   // Variables that are valid during spill(), but used by multiple methods.
   LiveRangeEdit *Edit;
@@ -202,6 +206,7 @@
         MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
         MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+        PSI(&pass.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI()),
         HSpiller(pass, mf, vrm) {}
 
   void spill(LiveRangeEdit &) override;
@@ -684,7 +689,7 @@
   if (DeadDefs.empty())
     return;
   LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
-  Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+  Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA);
 
   // LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions
   // after rematerialization.  To remove a VNI for a vreg from its LiveInterval,
@@ -835,8 +840,9 @@
   MachineInstrSpan MIS(MI, MI->getParent());
 
   MachineInstr *FoldMI =
-      LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
-             : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM);
+      LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, PSI, &MBFI, &LIS)
+             : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, PSI, &MBFI, &LIS,
+                                     &VRM);
   if (!FoldMI)
     return false;
 
@@ -1085,7 +1091,7 @@
   // Hoisted spills may cause dead code.
   if (!DeadDefs.empty()) {
     LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
-    Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+    Edit->eliminateDeadDefs(DeadDefs, PSI, &MBFI, RegsToSpill, AA);
   }
 
   // Finally delete the SnippetCopies.
@@ -1524,7 +1530,7 @@
           RMEnt->RemoveOperand(i - 1);
       }
     }
-    Edit.eliminateDeadDefs(SpillsToRm, None, AA);
+    Edit.eliminateDeadDefs(SpillsToRm, PSI, &MBFI, None, AA);
   }
 }
 
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -12,8 +12,10 @@
 
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
@@ -183,7 +185,9 @@
 }
 
 bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
-                               SmallVectorImpl<MachineInstr*> &Dead) {
+                               SmallVectorImpl<MachineInstr*> &Dead,
+                               ProfileSummaryInfo *PSI,
+                               const MachineBlockFrequencyInfo *MBFI) {
   MachineInstr *DefMI = nullptr, *UseMI = nullptr;
 
   // Check that there is a single def and a single use.
@@ -226,7 +230,8 @@
   if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
     return false;
 
-  MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
+  MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI,
+                                               PSI, MBFI, &LIS);
   if (!FoldMI)
     return false;
   LLVM_DEBUG(dbgs() << "                folded: " << *FoldMI);
@@ -258,7 +263,8 @@
 
 /// Find all live intervals that need to shrink, then remove the instruction.
 void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
-                                     AliasAnalysis *AA) {
+                                     AliasAnalysis *AA, ProfileSummaryInfo *PSI,
+                                     const MachineBlockFrequencyInfo *MBFI) {
   assert(MI->allDefsAreDead() && "Def isn't really dead");
   SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
 
@@ -390,6 +396,8 @@
 }
 
 void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+                                      ProfileSummaryInfo *PSI,
+                                      const MachineBlockFrequencyInfo *MBFI,
                                       ArrayRef<unsigned> RegsBeingSpilled,
                                       AliasAnalysis *AA) {
   ToShrinkSet ToShrink;
@@ -397,7 +405,7 @@
   for (;;) {
     // Erase all dead defs.
     while (!Dead.empty())
-      eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA);
+      eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA, PSI, MBFI);
 
     if (ToShrink.empty())
       break;
@@ -405,7 +413,7 @@
     // Shrink just one live interval. Then delete new dead defs.
     LiveInterval *LI = ToShrink.back();
     ToShrink.pop_back();
-    if (foldAsLoad(LI, Dead))
+    if (foldAsLoad(LI, Dead, PSI, MBFI))
       continue;
     unsigned VReg = LI->reg;
     if (TheDelegate)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -172,6 +172,13 @@
   initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
 }
 
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo(
+      MachineFunction &F,
+      MachineBranchProbabilityInfo &MBPI,
+      MachineLoopInfo &MLI) : MachineFunctionPass(ID) {
+  calculate(F, MBPI, MLI);
+}
+
 MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() = default;
 
 void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -41,6 +42,7 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -362,6 +364,8 @@
   /// A handle to the post dominator tree.
   MachinePostDominatorTree *MPDT;
 
+  ProfileSummaryInfo *PSI;
+
   /// Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -537,6 +541,7 @@
     if (TailDupPlacement)
       AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -554,6 +559,7 @@
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
                     "Branch Probability Basic Block Placement", false, false)
 
@@ -2025,7 +2031,10 @@
   // i.e. when the layout predecessor does not fallthrough to the loop header.
   // In practice this never happens though: there always seems to be a preheader
   // that can fallthrough and that is also placed before the header.
-  if (F->getFunction().hasOptSize())
+  bool OptForSize = F->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(L.getHeader(), PSI,
+                                                &MBFI->getMBFI());
+  if (OptForSize)
     return L.getHeader();
 
   MachineBasicBlock *OldTop = nullptr;
@@ -2781,6 +2790,11 @@
     if (Freq < (LoopHeaderFreq * ColdProb))
       continue;
 
+    // If the global profile indicates so, don't align it.
+    if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) &&
+        !TLI->alignLoopsWithOptSize())
+      continue;
+
     // Check for the existence of a non-layout predecessor which would benefit
     // from aligning this block.
     MachineBasicBlock *LayoutPred =
@@ -2988,6 +3002,7 @@
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MPDT = nullptr;
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
@@ -3018,10 +3033,13 @@
 
   if (allowTailDupPlacement()) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+    if (OptForSize)
       TailDupSize = 1;
     bool PreRegAlloc = false;
-    TailDup.initMF(MF, PreRegAlloc, MBPI, /* LayoutMode */ true, TailDupSize);
+    TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI,
+                   /* LayoutMode */ true, TailDupSize);
     precomputeTriangleChains();
   }
 
@@ -3037,7 +3055,7 @@
   if (MF.size() > 3 && EnableTailMerge) {
     unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
-                    *MBPI, TailMergeSize);
+                    *MBPI, PSI, TailMergeSize);
 
     auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
     if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -67,6 +68,7 @@
     AliasAnalysis *AA;
     MachineDominatorTree *DT;
     MachineRegisterInfo *MRI;
+    ProfileSummaryInfo *PSI;
     MachineBlockFrequencyInfo *MBFI;
 
   public:
@@ -87,6 +89,7 @@
       AU.addPreserved<MachineDominatorTree>();
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addPreserved<MachineBlockFrequencyInfo>();
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
     }
 
     void releaseMemory() override {
@@ -538,7 +541,7 @@
     // Commute commutable instructions.
     bool Commuted = false;
     if (!FoundCSE && MI->isCommutable()) {
-      if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
+      if (MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI)) {
         Commuted = true;
         FoundCSE = VNT.count(NewMI);
         if (NewMI != MI) {
@@ -547,7 +550,7 @@
           Changed = true;
         } else if (!FoundCSE)
           // MI was changed but it didn't help, commute it back!
-          (void)TII->commuteInstruction(*MI);
+          (void)TII->commuteInstruction(*MI, PSI, MBFI);
       }
     }
 
@@ -889,6 +892,7 @@
   DT = &getAnalysis<MachineDominatorTree>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   LookAheadLimit = TII->getMachineCSELookAheadLimit();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   bool ChangedPRE, ChangedCSE;
   ChangedPRE = PerformSimplePRE(DT);
   ChangedCSE = PerformCSE(DT->getRootNode());
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -12,11 +12,14 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -66,6 +69,8 @@
   MachineLoopInfo *MLI; // Current MachineLoopInfo
   MachineTraceMetrics *Traces;
   MachineTraceMetrics::Ensemble *MinInstr;
+  MachineBlockFrequencyInfo *MBFI;
+  ProfileSummaryInfo *PSI;
 
   TargetSchedModel TSchedModel;
 
@@ -82,7 +87,7 @@
   StringRef getPassName() const override { return "Machine InstCombiner"; }
 
 private:
-  bool doSubstitute(unsigned NewSize, unsigned OldSize);
+  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
   bool combineInstructions(MachineBasicBlock *);
   MachineInstr *getOperandDef(const MachineOperand &MO);
   unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -131,6 +136,8 @@
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<MachineTraceMetrics>();
   AU.addPreserved<MachineTraceMetrics>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -408,8 +415,9 @@
 
 /// \returns true when a new instruction sequence should be generated
 /// regardless of whether it lengthens the critical path or not
-bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
-  if (OptSize && (NewSize < OldSize))
+bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
+                                   bool OptForSize) {
+  if (OptForSize && (NewSize < OldSize))
     return true;
   if (!TSchedModel.hasInstrSchedModelOrItineraries())
     return true;
@@ -507,6 +515,8 @@
   SparseSet<LiveRegUnit> RegUnits;
   RegUnits.setUniverse(TRI->getNumRegUnits());
 
+  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
     SmallVector<MachineCombinerPattern, 16> Patterns;
@@ -583,7 +593,8 @@
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
-      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+      if (SubstituteAlways ||
+          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
         insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                  RegUnits, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
@@ -638,6 +649,10 @@
   MRI = &MF.getRegInfo();
   MLI = &getAnalysis<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
   MinInstr = nullptr;
   OptSize = MF.getFunction().hasOptSize();
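The PSI/MBFI acquisition above is the pattern this patch repeats in
PeepholeOptimizer, RegisterCoalescer, TwoAddressInstructionPass,
TailDuplication and, with the IR-level BFI, SelectionDAGISel: block
frequencies are computed lazily, and only when a profile summary is
present, so builds without profile data pay no extra analysis cost. A
minimal sketch of the pattern, assuming a MachineFunctionPass that
declared both analyses in getAnalysisUsage():

    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    MachineBlockFrequencyInfo *MBFI =
        PSI->hasProfileSummary()
            ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
            : nullptr;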
 
diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp
--- a/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/llvm/lib/CodeGen/MachineDominators.cpp
@@ -49,11 +49,15 @@
 }
 
 bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  calculate(F);
+  return false;
+}
+
+void MachineDominatorTree::calculate(MachineFunction &F) {
   CriticalEdgesToSplit.clear();
   NewBBs.clear();
   DT.reset(new DomTreeBase<MachineBasicBlock>());
   DT->recalculate(F);
-  return false;
 }
 
 MachineDominatorTree::MachineDominatorTree()
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -36,11 +36,15 @@
 char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
 
 bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
-  releaseMemory();
-  LI.analyze(getAnalysis<MachineDominatorTree>().getBase());
+  calculate(getAnalysis<MachineDominatorTree>());
   return false;
 }
 
+void MachineLoopInfo::calculate(MachineDominatorTree &MDT) {
+  releaseMemory();
+  LI.analyze(MDT.getBase());
+}
+
 void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired<MachineDominatorTree>();
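The new MachineBlockFrequencyInfo constructor above, together with the
calculate() helpers split out of runOnMachineFunction() in
MachineDominatorTree and MachineLoopInfo, makes it possible to build
block frequencies on demand outside the pass manager, which is what
LazyMachineBlockFrequencyInfoPass needs. A sketch of how the three
pieces compose, assuming MBPI was already obtained via getAnalysis<>:

    MachineDominatorTree MDT;
    MDT.calculate(MF);                              // new helper
    MachineLoopInfo MLI;
    MLI.calculate(MDT);                             // new helper
    MachineBlockFrequencyInfo MBFI(MF, MBPI, MLI);  // new constructor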
diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp
@@ -0,0 +1,120 @@
+//===- MachineSizeOpts.cpp - code size optimization related code ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code size optimization logic shared by machine IR
+// passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+extern cl::opt<bool> EnablePGSO;
+extern cl::opt<bool> PGSOLargeWorkingSetSizeOnly;
+extern cl::opt<bool> ForcePGSO;
+extern cl::opt<int> PgsoCutoffInstrProf;
+extern cl::opt<int> PgsoCutoffSampleProf;
+
+namespace machine_size_opts_detail {
+
+/// Like ProfileSummaryInfo::isColdBlock but for MachineBasicBlock.
+bool isColdBlock(const MachineBasicBlock *MBB,
+                 ProfileSummaryInfo *PSI,
+                 const MachineBlockFrequencyInfo *MBFI) {
+  auto Count = MBFI->getBlockProfileCount(MBB);
+  return Count && PSI->isColdCount(*Count);
+}
+
+/// Like ProfileSummaryInfo::isHotBlockNthPercentile but for MachineBasicBlock.
+static bool isHotBlockNthPercentile(int PercentileCutoff,
+                                    const MachineBasicBlock *MBB,
+                                    ProfileSummaryInfo *PSI,
+                                    const MachineBlockFrequencyInfo *MBFI) {
+  auto Count = MBFI->getBlockProfileCount(MBB);
+  return Count && PSI->isHotCountNthPercentile(PercentileCutoff, *Count);
+}
+
+/// Like ProfileSummaryInfo::isFunctionColdInCallGraph but for
+/// MachineFunction.
+bool isFunctionColdInCallGraph(
+    const MachineFunction *MF,
+    ProfileSummaryInfo *PSI,
+    const MachineBlockFrequencyInfo &MBFI) {
+  if (auto FunctionCount = MF->getFunction().getEntryCount())
+    if (!PSI->isColdCount(FunctionCount.getCount()))
+      return false;
+  for (const auto &MBB : *MF)
+    if (!isColdBlock(&MBB, PSI, &MBFI))
+      return false;
+  return true;
+}
+
+/// Like ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile but for
+/// MachineFunction.
+bool isFunctionHotInCallGraphNthPercentile(
+    int PercentileCutoff,
+    const MachineFunction *MF,
+    ProfileSummaryInfo *PSI,
+    const MachineBlockFrequencyInfo &MBFI) {
+  if (auto FunctionCount = MF->getFunction().getEntryCount())
+    if (PSI->isHotCountNthPercentile(PercentileCutoff,
+                                     FunctionCount.getCount()))
+      return true;
+  for (const auto &MBB : *MF)
+    if (isHotBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI))
+      return true;
+  return false;
+}
+} // namespace machine_size_opts_detail
+
+namespace {
+struct MachineBasicBlockBFIAdapter {
+  static bool isFunctionColdInCallGraph(const MachineFunction *MF,
+                                        ProfileSummaryInfo *PSI,
+                                        const MachineBlockFrequencyInfo &MBFI) {
+    return machine_size_opts_detail::isFunctionColdInCallGraph(MF, PSI, MBFI);
+  }
+  static bool isFunctionHotInCallGraphNthPercentile(
+      int CutOff,
+      const MachineFunction *MF,
+      ProfileSummaryInfo *PSI,
+      const MachineBlockFrequencyInfo &MBFI) {
+    return machine_size_opts_detail::isFunctionHotInCallGraphNthPercentile(
+        CutOff, MF, PSI, MBFI);
+  }
+  static bool isColdBlock(const MachineBasicBlock *MBB,
+                          ProfileSummaryInfo *PSI,
+                          const MachineBlockFrequencyInfo *MBFI) {
+    return machine_size_opts_detail::isColdBlock(MBB, PSI, MBFI);
+  }
+  static bool isHotBlockNthPercentile(int CutOff,
+                                      const MachineBasicBlock *MBB,
+                                      ProfileSummaryInfo *PSI,
+                                      const MachineBlockFrequencyInfo *MBFI) {
+    return machine_size_opts_detail::isHotBlockNthPercentile(
+        CutOff, MBB, PSI, MBFI);
+  }
+};
+} // end anonymous namespace
+
+bool llvm::shouldOptimizeForSize(const MachineFunction *MF,
+                                 ProfileSummaryInfo *PSI,
+                                 const MachineBlockFrequencyInfo *MBFI) {
+  return shouldFuncOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>(
+      MF, PSI, MBFI);
+}
+
+bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
+                                 ProfileSummaryInfo *PSI,
+                                 const MachineBlockFrequencyInfo *MBFI) {
+  return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>(
+      MBB, PSI, MBFI);
+}
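These two entry points mirror llvm::shouldOptimizeForSize from
Transforms/Utils/SizeOpts.h and reuse its shouldFuncOptimizeForSizeImpl /
shouldOptimizeForSizeImpl templates, which are expected to treat a null
PSI or MBFI as "no profile data" and return false. The call pattern the
passes in this patch adopt is, as a sketch:

    // Static -Os/-Oz still wins; the PGSO query only adds to it when
    // profile data is available.
    bool OptForSize = MF.getFunction().hasOptSize() ||
                      llvm::shouldOptimizeForSize(&MF, PSI, MBFI);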
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -71,6 +71,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -156,6 +158,8 @@
     MachineRegisterInfo *MRI;
     MachineDominatorTree *DT;  // Machine dominator tree
     MachineLoopInfo *MLI;
+    ProfileSummaryInfo *PSI;
+    MachineBlockFrequencyInfo *MBFI;
 
   public:
     static char ID; // Pass identification
@@ -175,6 +179,8 @@
         AU.addRequired<MachineDominatorTree>();
         AU.addPreserved<MachineDominatorTree>();
       }
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
     }
 
     /// Track Def -> Use info used for rewriting copies.
@@ -1580,7 +1586,7 @@
       auto CP = RI.getCommutePair();
       if (CP) {
         Changed = true;
-        TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+        TII->commuteInstruction(*(RI.getMI()), PSI, MBFI, false, (*CP).first,
                                 (*CP).second);
         LLVM_DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
       }
@@ -1605,6 +1611,10 @@
   MRI = &MF.getRegInfo();
   DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
   MLI = &getAnalysis<MachineLoopInfo>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
 
   bool Changed = false;
 
@@ -1767,7 +1777,8 @@
             unsigned FoldedReg = FoldAsLoadDefReg;
             MachineInstr *DefMI = nullptr;
             if (MachineInstr *FoldMI =
-                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
+                    TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI,
+                                           PSI, MBFI)) {
               // Update LocalMIs since we replaced MI with FoldMI and deleted
               // DefMI.
               LLVM_DEBUG(dbgs() << "Replacing: " << *MI);
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -16,6 +16,7 @@
 #include "RegAllocBase.h"
 #include "Spiller.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
@@ -190,6 +191,7 @@
   AU.addPreserved<VirtRegMap>();
   AU.addRequired<LiveRegMatrix>();
   AU.addPreserved<LiveRegMatrix>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();  // Needed for InlineSpiller.
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/EdgeBundles.h"
 #include "llvm/CodeGen/LiveInterval.h"
@@ -626,6 +627,7 @@
   AU.addRequired<EdgeBundles>();
   AU.addRequired<SpillPlacement>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();  // Needed for InlineSpiller.
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -3239,6 +3241,7 @@
   SpillPlacer = &getAnalysis<SpillPlacement>();
   DebugVars = &getAnalysis<LiveDebugVariables>();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  auto PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
   initializeCSRCost();
 
@@ -3247,7 +3250,7 @@
   LLVM_DEBUG(LIS->dump());
 
   SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
-  SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI));
+  SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, PSI));
   ExtraRegInfo.clear();
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
   NextCascade = 1;
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -549,6 +550,7 @@
   au.addPreserved<MachineDominatorTree>();
   au.addRequired<VirtRegMap>();
   au.addPreserved<VirtRegMap>();
+  au.addRequired<ProfileSummaryInfoWrapperPass>();  // Needed for InlineSpiller.
   MachineFunctionPass::getAnalysisUsage(au);
 }
 
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -21,6 +21,8 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
@@ -129,6 +131,8 @@
     const MachineLoopInfo* Loops;
     AliasAnalysis *AA;
     RegisterClassInfo RegClassInfo;
+    ProfileSummaryInfo *PSI;
+    MachineBlockFrequencyInfo *MBFI;
 
     /// A LaneMask to remember on which subregister live ranges we need to call
     /// shrinkToUses() later.
@@ -538,13 +542,15 @@
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   AU.addPreservedID(MachineDominatorsID);
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
 void RegisterCoalescer::eliminateDeadDefs() {
   SmallVector<unsigned, 8> NewRegs;
   LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
-                nullptr, this).eliminateDeadDefs(DeadDefs);
+                nullptr, this).eliminateDeadDefs(DeadDefs, PSI, MBFI);
 }
 
 void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
@@ -832,7 +838,7 @@
   // transformation.  Start by commuting the instruction.
   MachineBasicBlock *MBB = DefMI->getParent();
   MachineInstr *NewMI =
-      TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
+      TII->commuteInstruction(*DefMI, PSI, MBFI, false, UseOpIdx, NewDstIdx);
   if (!NewMI)
     return { false, false };
   if (Register::isVirtualRegister(IntA.reg) &&
@@ -3686,6 +3692,10 @@
     JoinGlobalCopies = STI.enableJoinGlobalCopies();
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
 
   // The MachineScheduler does not currently require JoinSplitEdges. This will
   // either be enabled unconditionally or replaced by a more general live range
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -217,7 +217,7 @@
     DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), AA(AA) {
-      ForCodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+      ForCodeSize = DAG.shouldOptForSize();
 
       MaximumLegalStoreInBits = 0;
       for (MVT VT : MVT::all_valuetypes())
@@ -12645,7 +12645,7 @@
 
     // Assume that libcalls are the smallest code.
     // TODO: This restriction should probably be lifted for vectors.
-    if (DAG.getMachineFunction().getFunction().hasOptSize())
+    if (ForCodeSize)
       return SDValue();
 
     // pow(X, 0.25) --> sqrt(sqrt(X))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -24,6 +24,8 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -63,6 +65,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -1005,7 +1008,9 @@
 void SelectionDAG::init(MachineFunction &NewMF,
                         OptimizationRemarkEmitter &NewORE,
                         Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
-                        LegacyDivergenceAnalysis * Divergence) {
+                        LegacyDivergenceAnalysis * Divergence,
+                        ProfileSummaryInfo *PSIin,
+                        BlockFrequencyInfo *BFIin) {
   MF = &NewMF;
   SDAGISelPass = PassPtr;
   ORE = &NewORE;
@@ -1014,6 +1019,8 @@
   LibInfo = LibraryInfo;
   Context = &MF->getFunction().getContext();
   DA = Divergence;
+  PSI = PSIin;
+  BFI = BFIin;
 }
 
 SelectionDAG::~SelectionDAG() {
@@ -1023,6 +1030,11 @@
   delete DbgInfo;
 }
 
+bool SelectionDAG::shouldOptForSize() const {
+  return MF->getFunction().hasOptSize() ||
+      llvm::shouldOptimizeForSize(FLI->MBB->getBasicBlock(), PSI, BFI);
+}
+
 void SelectionDAG::allnodes_clear() {
   assert(&*AllNodes.begin() == &EntryNode);
   AllNodes.remove(AllNodes.begin());
@@ -1425,7 +1437,7 @@
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
   if (Alignment == 0)
-    Alignment = MF->getFunction().hasOptSize()
+    Alignment = shouldOptForSize()
                     ? getDataLayout().getABITypeAlignment(C->getType())
                     : getDataLayout().getPrefTypeAlignment(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
@@ -5722,12 +5734,13 @@
                                   SrcDelta + G->getOffset());
 }
 
-static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF,
+                                      SelectionDAG &DAG) {
   // On Darwin, -Os means optimize for size without hurting performance, so
   // only really optimize for size when -Oz (MinSize) is used.
   if (MF.getTarget().getTargetTriple().isOSDarwin())
     return MF.getFunction().hasMinSize();
-  return MF.getFunction().hasOptSize();
+  return DAG.shouldOptForSize();
 }
 
 static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
@@ -5777,7 +5790,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool OptSize = shouldLowerMemFuncForSize(MF);
+  bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -5960,7 +5973,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool OptSize = shouldLowerMemFuncForSize(MF);
+  bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -6066,7 +6079,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool OptSize = shouldLowerMemFuncForSize(MF);
+  bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
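Because the DAG now carries PSI and BFI, shouldOptForSize() is
block-sensitive: during instruction selection FLI->MBB is the block
currently being lowered, so one function can take the size-optimized
path in its cold blocks while its hot blocks keep the fast path.
Guarding a size-increasing combine then reduces to a sketch like:

    if (DAG.shouldOptForSize())
      return SDValue();  // don't grow code in size-relevant blocks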
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -28,10 +28,12 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -5329,8 +5331,8 @@
     if (Val == 0)
       return DAG.getConstantFP(1.0, DL, LHS.getValueType());
 
-    const Function &F = DAG.getMachineFunction().getFunction();
-    if (!F.hasOptSize() ||
+    bool OptForSize = DAG.shouldOptForSize();
+    if (!OptForSize ||
         // If optimizing for size, don't insert too many multiplies.
         // This inserts up to 5 multiplies.
         countPopulation(Val) + Log2_32(Val) < 7) {
@@ -10508,7 +10510,7 @@
     return;
   }
 
-  SL->findJumpTables(Clusters, &SI, DefaultMBB);
+  SL->findJumpTables(Clusters, &SI, DefaultMBB, DAG.getPSI(), DAG.getBFI());
   SL->findBitTestClusters(Clusters, &SI);
 
   LLVM_DEBUG({
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,7 +27,9 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/FastISel.h"
@@ -339,6 +341,8 @@
   AU.addRequired<TargetTransformInfoWrapperPass>();
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     AU.addRequired<BranchProbabilityInfoWrapperPass>();
+  AU.addRequired<ProfileSummaryInfoWrapperPass>();
+  LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -441,13 +445,17 @@
   DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
   auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
   LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+              &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+              nullptr;
 
   LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
 
   CurDAG->init(*MF, *ORE, this, LibInfo,
-   getAnalysisIfAvailable<LegacyDivergenceAnalysis>());
+               getAnalysisIfAvailable<LegacyDivergenceAnalysis>(), PSI, BFI);
   FuncInfo->set(Fn, *MF, CurDAG);
   SwiftError->setFunction(*MF);
 
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -36,6 +36,7 @@
 
 class LiveIntervals;
 class LiveRangeEdit;
 class MachineBlockFrequencyInfo;
 class MachineDominatorTree;
 class MachineLoopInfo;
+class ProfileSummaryInfo;
@@ -264,6 +265,7 @@
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  ProfileSummaryInfo *PSI;
 
 public:
   /// ComplementSpillMode - Select how the complement live range should be
@@ -444,7 +446,8 @@
   /// Newly created intervals will be appended to newIntervals.
   SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis,
               VirtRegMap &vrm, MachineDominatorTree &mdt,
-              MachineBlockFrequencyInfo &mbfi);
+              MachineBlockFrequencyInfo &mbfi,
+              ProfileSummaryInfo *PSI);
 
   /// reset - Prepare for a new split.
   void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -365,12 +365,13 @@
 SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa,
                          LiveIntervals &lis, VirtRegMap &vrm,
                          MachineDominatorTree &mdt,
-                         MachineBlockFrequencyInfo &mbfi)
+                         MachineBlockFrequencyInfo &mbfi,
+                         ProfileSummaryInfo *PSI)
     : SA(sa), AA(aa), LIS(lis), VRM(vrm),
       MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt),
       TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
       TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()),
-      MBFI(mbfi), RegAssign(Allocator) {}
+      MBFI(mbfi), PSI(PSI), RegAssign(Allocator) {}
 
 void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
   Edit = &LRE;
@@ -1429,7 +1430,7 @@
   if (Dead.empty())
     return;
 
-  Edit->eliminateDeadDefs(Dead, None, &AA);
+  Edit->eliminateDeadDefs(Dead, PSI, &MBFI, None, &AA);
 }
 
 void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -40,9 +40,12 @@
   return NumCases;
 }
 
-void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters,
-                                              const SwitchInst *SI,
-                                              MachineBasicBlock *DefaultMBB) {
+void SwitchCG::SwitchLowering::findJumpTables(
+    CaseClusterVector &Clusters,
+    const SwitchInst *SI,
+    MachineBasicBlock *DefaultMBB,
+    ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI) {
 #ifndef NDEBUG
   // Clusters must be non-empty, sorted, and only contain Range clusters.
   assert(!Clusters.empty());
@@ -80,7 +83,7 @@
   assert(Range >= NumCases);
 
   // Cheap case: the whole range may be suitable for jump table.
-  if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) {
+  if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) {
     CaseCluster JTCluster;
     if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
       Clusters[0] = JTCluster;
@@ -138,7 +141,7 @@
       assert(NumCases < UINT64_MAX / 100);
       assert(Range >= NumCases);
 
-      if (TLI->isSuitableForJumpTable(SI, NumCases, Range)) {
+      if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) {
         unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
         unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
         int64_t NumEntries = j - i + 1;
diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp
--- a/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/llvm/lib/CodeGen/TailDuplication.cpp
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -37,6 +39,8 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -74,7 +78,11 @@
     return false;
 
   auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
-  Duplicator.initMF(MF, PreRegAlloc, MBPI, /*LayoutMode=*/false);
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
+  Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false);
 
   bool MadeChange = false;
   while (Duplicator.tailDuplicateBlocks())
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -19,13 +19,16 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -77,6 +80,8 @@
 
 void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
                             const MachineBranchProbabilityInfo *MBPIin,
+                            const MachineBlockFrequencyInfo *MBFIin,
+                            ProfileSummaryInfo *PSIin,
                             bool LayoutModeIn, unsigned TailDupSizeIn) {
   MF = &MFin;
   TII = MF->getSubtarget().getInstrInfo();
@@ -84,6 +89,8 @@
   MRI = &MF->getRegInfo();
   MMI = &MF->getMMI();
   MBPI = MBPIin;
+  MBFI = MBFIin;
+  PSI = PSIin;
   TailDupSize = TailDupSizeIn;
 
   assert(MBPI != nullptr && "Machine Branch Probability Info required");
@@ -555,14 +562,14 @@
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
-  if (TailDupSize == 0 &&
-      TailDuplicateSize.getNumOccurrences() == 0 &&
-      MF->getFunction().hasOptSize())
-    MaxDuplicateCount = 1;
-  else if (TailDupSize == 0)
+  bool OptForSize = MF->getFunction().hasOptSize() ||
+                    llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI);
+  if (TailDupSize == 0)
     MaxDuplicateCount = TailDuplicateSize;
   else
     MaxDuplicateCount = TailDupSize;
+  if (OptForSize)
+    MaxDuplicateCount = 1;
 
   // If the block to be duplicated ends in an unanalyzable fallthrough, don't
   // duplicate it.
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -154,6 +154,8 @@
 }
 
 MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                      ProfileSummaryInfo *PSI,
+                                                      const MachineBlockFrequencyInfo *MBFI,
                                                       bool NewMI, unsigned Idx1,
                                                       unsigned Idx2) const {
   const MCInstrDesc &MCID = MI.getDesc();
@@ -236,7 +238,10 @@
   return CommutedMI;
 }
 
-MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI,
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI,
+                                                  ProfileSummaryInfo *PSI,
+                                                  const MachineBlockFrequencyInfo *MBFI,
+                                                  bool NewMI,
                                                   unsigned OpIdx1,
                                                   unsigned OpIdx2) const {
   // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose
@@ -248,7 +253,7 @@
            "Precondition violation: MI must be commutable.");
     return nullptr;
   }
-  return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  return commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1, OpIdx2);
 }
 
 bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
@@ -530,6 +535,8 @@
 
 MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
                                                  ArrayRef<unsigned> Ops, int FI,
+                                                 ProfileSummaryInfo *PSI,
+                                                 const MachineBlockFrequencyInfo *MBFI,
                                                  LiveIntervals *LIS,
                                                  VirtRegMap *VRM) const {
   auto Flags = MachineMemOperand::MONone;
@@ -577,7 +584,7 @@
       MBB->insert(MI, NewMI);
   } else {
     // Ask the target to do the actual folding.
-    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM);
+    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, PSI, MBFI, LIS, VRM);
   }
 
   if (NewMI) {
@@ -619,6 +626,8 @@
 MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
                                                  ArrayRef<unsigned> Ops,
                                                  MachineInstr &LoadMI,
+                                                 ProfileSummaryInfo *PSI,
+                                                 const MachineBlockFrequencyInfo *MBFI,
                                                  LiveIntervals *LIS) const {
   assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!");
 #ifndef NDEBUG
@@ -643,7 +652,7 @@
       NewMI = &*MBB.insert(MI, NewMI);
   } else {
     // Ask the target to do the actual folding.
-    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS);
+    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, PSI, MBFI, LIS);
   }
 
   if (!NewMI)
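Callers with no profile information in scope pass null for the two new
parameters, which preserves the pre-PGSO behavior; the AMDGPU updates
below all take that form, e.g.:

    TII->commuteInstruction(MI, /*PSI=*/nullptr, /*MBFI=*/nullptr);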
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -33,6 +33,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveVariables.h"
@@ -99,6 +101,8 @@
   LiveIntervals *LIS;
   AliasAnalysis *AA;
   CodeGenOpt::Level OptLevel;
+  ProfileSummaryInfo *PSI;
+  MachineBlockFrequencyInfo *MBFI;
 
   // The current basic block being processed.
   MachineBasicBlock *MBB;
@@ -188,6 +192,8 @@
     AU.addPreserved<LiveIntervals>();
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -684,7 +690,8 @@
                                                    unsigned Dist) {
   Register RegC = MI->getOperand(RegCIdx).getReg();
   LLVM_DEBUG(dbgs() << "2addr: COMMUTING  : " << *MI);
-  MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
+  MachineInstr *NewMI = TII->commuteInstruction(*MI, PSI, MBFI, false, RegBIdx,
+                                                RegCIdx);
 
   if (NewMI == nullptr) {
     LLVM_DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
@@ -1681,6 +1688,10 @@
   // fixups are necessary for correctness.
   if (skipFunction(Func.getFunction()))
     OptLevel = CodeGenOpt::None;
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
 
   bool MadeChange = false;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -162,6 +162,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        ProfileSummaryInfo *PSI,
+                        const MachineBlockFrequencyInfo *MBFI,
                         LiveIntervals *LIS = nullptr,
                         VirtRegMap *VRM = nullptr) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3165,6 +3165,7 @@
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
     LiveIntervals *LIS, VirtRegMap *VRM) const {
   // This is a bit of a hack. Consider this instruction:
   //
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -466,7 +466,7 @@
       auto *BB = OrigMI.getParent();
       auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
       BB->insert(OrigMI, NewMI);
-      if (TII->commuteInstruction(*NewMI)) {
+      if (TII->commuteInstruction(*NewMI, nullptr, nullptr)) {
         LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
         if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                           OldOpndValue, CombBCZ)) {
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -273,7 +273,7 @@
     MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
 
     if (Fold.isCommuted())
-      TII.commuteInstruction(*Inst32, false);
+      TII.commuteInstruction(*Inst32, nullptr, nullptr, false);
     return true;
   }
 
@@ -377,7 +377,8 @@
       return false;
 
     if (!CanCommute ||
-        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
+        !TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0,
+                                 CommuteIdx1))
       return false;
 
     if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
@@ -406,7 +407,8 @@
         return true;
       }
 
-      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
+      TII->commuteInstruction(*MI, nullptr, nullptr, false, CommuteIdx0,
+                              CommuteIdx1);
       return false;
     }
 
@@ -1119,7 +1121,7 @@
       tryFoldInst(TII, Fold.UseMI);
     } else if (Fold.isCommuted()) {
       // Restoring instruction's original operand order if fold has failed.
-      TII->commuteInstruction(*Fold.UseMI, false);
+      TII->commuteInstruction(*Fold.UseMI, nullptr, nullptr, false);
     }
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -371,7 +371,7 @@
   MachineOperand &Op1 = A->getOperand(1);
   MachineOperand &Op2 = A->getOperand(2);
   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
-    TII->commuteInstruction(*A);
+    TII->commuteInstruction(*A, nullptr, nullptr);
     Changed = true;
   }
   if (Op1.getReg() != ExecReg)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -137,7 +137,10 @@
                            MachineOperand &Src0, unsigned Src0OpName,
                            MachineOperand &Src1, unsigned Src1OpName) const;
 
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned OpIdx0,
                                        unsigned OpIdx1) const override;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1611,7 +1611,10 @@
   return &MI;
 }
 
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                  ProfileSummaryInfo *PSI,
+                                                  const MachineBlockFrequencyInfo *MBFI,
+                                                  bool NewMI,
                                                   unsigned Src0Idx,
                                                   unsigned Src1Idx) const {
   assert(!NewMI && "this should never be used");
@@ -1635,7 +1638,8 @@
     if (isOperandLegal(MI, Src1Idx, &Src0)) {
       // Be sure to copy the source modifiers to the right place.
       CommutedMI
-        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
+          = TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI,
+                                                    Src0Idx, Src1Idx);
     }
 
   } else if (Src0.isReg() && !Src1.isReg()) {
@@ -2401,7 +2405,7 @@
         if (Def && Def->isMoveImmediate() &&
             isInlineConstant(Def->getOperand(1)) &&
             MRI->hasOneUse(Src1->getReg()) &&
-            commuteInstruction(UseMI)) {
+            commuteInstruction(UseMI, nullptr, nullptr)) {
             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
         } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
                     RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -113,12 +113,12 @@
 
   // We have failed to fold src0, so commute the instruction and try again.
   if (TryToCommute && MI.isCommutable()) {
-    if (TII->commuteInstruction(MI)) {
+    if (TII->commuteInstruction(MI, nullptr, nullptr)) {
       if (foldImmediates(MI, TII, MRI, false))
         return true;
 
       // Commute back.
-      TII->commuteInstruction(MI);
+      TII->commuteInstruction(MI, nullptr, nullptr);
     }
   }
 
@@ -183,7 +183,7 @@
   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
   // get constants on the RHS.
   if (!MI.getOperand(0).isReg())
-    TII->commuteInstruction(MI, false, 0, 1);
+    TII->commuteInstruction(MI, nullptr, nullptr, false, 0, 1);
 
   const MachineOperand &Src1 = MI.getOperand(1);
   if (!Src1.isImm())
@@ -355,7 +355,7 @@
 
     if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
         SrcImm == Src0) {
-      if (!TII->commuteInstruction(MI, false, 1, 2))
+      if (!TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2))
         NewImm = 0;
     }
 
@@ -634,7 +634,7 @@
         MachineOperand *Src1 = &MI.getOperand(2);
 
         if (!Src0->isReg() && Src1->isReg()) {
-          if (TII->commuteInstruction(MI, false, 1, 2))
+          if (TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2))
             std::swap(Src0, Src1);
         }
 
@@ -704,7 +704,8 @@
       if (!TII->canShrink(MI, MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
         // it.
-        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
+        if (!MI.isCommutable() ||
+            !TII->commuteInstruction(MI, nullptr, nullptr) ||
             !TII->canShrink(MI, MRI))
           continue;
       }
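Throughout this file the new nullptr, nullptr arguments merely satisfy the widened
commuteInstruction signature; SIShrinkInstructions has no profile analyses wired up.
For contrast, a minimal sketch (not part of this patch) of a caller that does carry
the analyses, assuming the trailing parameters keep their existing defaults:

    // Hypothetical profile-aware caller; the wrapper forwards PSI and MBFI
    // down to the target's commuteInstructionImpl override shown above.
    if (MachineInstr *Commuted =
            TII->commuteInstruction(MI, PSI, MBFI, /*NewMI=*/false,
                                    /*OpIdx1=*/1, /*OpIdx2=*/2)) {
      // ... operate on the commuted instruction ...
    }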
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -96,7 +96,10 @@
   /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands; a null pointer is returned in such cases.
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned OpIdx1,
                                        unsigned OpIdx2) const override;
 
@@ -326,7 +329,9 @@
   /// VFP/NEON execution domains.
   std::pair<uint16_t, uint16_t>
   getExecutionDomain(const MachineInstr &MI) const override;
-  void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+  void setExecutionDomain(MachineInstr &MI, unsigned Domain,
+                          ProfileSummaryInfo *PSI,
+                          const MachineBlockFrequencyInfo *MBFI) const override;
 
   unsigned
   getPartialRegUpdateClearance(const MachineInstr &, unsigned,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2146,6 +2146,8 @@
 }
 
 MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                       ProfileSummaryInfo *PSI,
+                                                       const MachineBlockFrequencyInfo *MBFI,
                                                        bool NewMI,
                                                        unsigned OpIdx1,
                                                        unsigned OpIdx2) const {
@@ -2159,7 +2161,8 @@
     if (CC == ARMCC::AL || PredReg != ARM::CPSR)
       return nullptr;
     MachineInstr *CommutedMI =
-        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+        TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1,
+                                                OpIdx2);
     if (!CommutedMI)
       return nullptr;
     // After swapping the MOVCC operands, also invert the condition.
@@ -2168,7 +2171,8 @@
     return CommutedMI;
   }
   }
-  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1,
+                                                 OpIdx2);
 }
 
 /// Identify instructions that can be folded into a MOVCC instruction, and
@@ -4926,7 +4930,9 @@
 }
 
 void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
-                                          unsigned Domain) const {
+                                          unsigned Domain,
+                                          ProfileSummaryInfo *PSI,
+                                          const MachineBlockFrequencyInfo *MBFI) const {
   unsigned DstReg, SrcReg, DReg;
   unsigned Lane;
   MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
--- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -758,7 +758,7 @@
       if (Reg1 != Reg0)
         return false;
       // Try to commute the operands to make it a 2-address instruction.
-      MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
+      MachineInstr *CommutedMI = TII->commuteInstruction(*MI, nullptr, nullptr);
       if (!CommutedMI)
         return false;
     }
@@ -770,7 +770,8 @@
         MI->getOperand(CommOpIdx2).getReg() != Reg0)
       return false;
     MachineInstr *CommutedMI =
-        TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
+        TII->commuteInstruction(*MI, nullptr, nullptr, false,
+                                CommOpIdx1, CommOpIdx2);
     if (!CommutedMI)
       return false;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -171,7 +171,10 @@
   ///
   /// For example, we can commute rlwimi instructions, but only if the
   /// rotate amt is zero.  We also have to munge the immediates a bit.
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned OpIdx1,
                                        unsigned OpIdx2) const override;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -364,14 +364,18 @@
   return 0;
 }
 
-MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                   ProfileSummaryInfo *PSI,
+                                                   const MachineBlockFrequencyInfo *MBFI,
+                                                   bool NewMI,
                                                    unsigned OpIdx1,
                                                    unsigned OpIdx2) const {
   MachineFunction &MF = *MI.getParent()->getParent();
 
   // Normal instructions can be commuted the obvious way.
   if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo)
-    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI, OpIdx1,
+                                                   OpIdx2);
   // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
   // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
   // changing the relative order of the mask operands might change what happens
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -194,7 +194,10 @@
   /// non-commutable operands.
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands; a null pointer is returned in such cases.
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned CommuteOpIdx1,
                                        unsigned CommuteOpIdx2) const override;
 
@@ -261,11 +264,15 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        ProfileSummaryInfo *PSI,
+                        const MachineBlockFrequencyInfo *MBFI,
                         LiveIntervals *LIS = nullptr,
                         VirtRegMap *VRM = nullptr) const override;
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      ProfileSummaryInfo *PSI,
+      const MachineBlockFrequencyInfo *MBFI,
       LiveIntervals *LIS = nullptr) const override;
   bool expandPostRAPseudo(MachineInstr &MBBI) const override;
   bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -270,6 +270,8 @@
 }
 
 MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                       ProfileSummaryInfo *PSI,
+                                                       const MachineBlockFrequencyInfo *MBFI,
                                                        bool NewMI,
                                                        unsigned OpIdx1,
                                                        unsigned OpIdx2) const {
@@ -293,11 +295,13 @@
     unsigned CCValid = WorkingMI.getOperand(3).getImm();
     unsigned CCMask = WorkingMI.getOperand(4).getImm();
     WorkingMI.getOperand(4).setImm(CCMask ^ CCValid);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   default:
-    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI,
+                                                   NewMI, OpIdx1, OpIdx2);
   }
 }
 
@@ -655,7 +659,7 @@
   }
 
   if (CommuteIdx != -1)
-    if (!commuteInstruction(UseMI, false, CommuteIdx, UseIdx))
+    if (!commuteInstruction(UseMI, nullptr, nullptr, false, CommuteIdx, UseIdx))
       return false;
 
   bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
@@ -997,6 +1001,7 @@
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
     LiveIntervals *LIS, VirtRegMap *VRM) const {
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1210,6 +1215,7 @@
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
     LiveIntervals *LIS) const {
   return nullptr;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
--- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -137,7 +137,7 @@
 
   // If the destination (now) matches one source, prefer this to be first.
   if (DestReg != Src1Reg && DestReg == Src2Reg) {
-    TII->commuteInstruction(*MBBI, false, 1, 2);
+    TII->commuteInstruction(*MBBI, nullptr, nullptr, false, 1, 2);
     std::swap(Src1Reg, Src2Reg);
     std::swap(Src1IsHigh, Src2IsHigh);
   }
@@ -269,4 +269,3 @@
 
   return Modified;
 }
-
diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
--- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -185,7 +185,7 @@
     return true;
   }
   if (MI.getOperand(0).getReg() == MI.getOperand(2).getReg()) {
-    TII->commuteInstruction(MI, false, 1, 2);
+    TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2);
     MI.setDesc(TII->get(Opcode));
     MI.tieOperands(0, 1);
     return true;
@@ -338,7 +338,7 @@
       if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) &&
           (!MI.isCommutable() ||
            MI.getOperand(0).getReg() != MI.getOperand(2).getReg() ||
-           !TII->commuteInstruction(MI, false, 1, 2)))
+           !TII->commuteInstruction(MI, nullptr, nullptr, false, 1, 2)))
           break;
 
       MI.setDesc(TII->get(TwoOperandOpcode));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -48,7 +48,10 @@
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned OpIdx1,
                                        unsigned OpIdx2) const override;
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -85,7 +85,9 @@
 }
 
 MachineInstr *WebAssemblyInstrInfo::commuteInstructionImpl(
-    MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const {
+    MachineInstr &MI,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
+    bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const {
   // If the operands are stackified, we can't reorder them.
   WebAssemblyFunctionInfo &MFI =
       *MI.getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
@@ -94,7 +96,8 @@
     return nullptr;
 
   // Otherwise use the default implementation.
-  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI, NewMI,
+                                                 OpIdx1, OpIdx2);
 }
 
 // Branch analysis.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -740,7 +740,8 @@
       assert(!Declined &&
              "Don't decline commuting until you've finished trying it");
       // Commuting didn't help. Revert it.
-      TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+      TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false,
+                              Operand0, Operand1);
       TentativelyCommuting = false;
       Declined = true;
     } else if (!Declined && TreeWalker.hasRemainingOperands(Insert)) {
@@ -748,7 +749,8 @@
       Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
       if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
         // Tentatively commute the operands and try again.
-        TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+        TII->commuteInstruction(*Insert, nullptr, nullptr, /*NewMI=*/false,
+                                Operand0, Operand1);
         TreeWalker.resetTopOperands(Insert);
         TentativelyCommuting = true;
         Declined = false;
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -179,6 +179,11 @@
                              bool Op0IsKill, unsigned Op1, bool Op1IsKill,
                              unsigned Op2, bool Op2IsKill, unsigned Op3,
                              bool Op3IsKill);
+
+  bool shouldOptForSize(const MachineFunction *MF) const {
+    // TODO: Implement PGSO.
+    return MF->getFunction().hasOptSize();
+  }
 };
 
 } // end anonymous namespace.
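The shouldOptForSize member added above is deliberately a stub, as its TODO notes.
A plausible completed form, assuming FastISel later gains PSI/MBFI pointers and a
MachineFunction-level llvm::shouldOptimizeForSize overload (neither is part of this
diff):

    // Hypothetical PGSO-aware version of the stub; PSI and MBFI are assumed
    // members, and shouldOptimizeForSize(MF, ...) an assumed overload.
    bool shouldOptForSize(const MachineFunction *MF) const {
      return MF->getFunction().hasOptSize() ||
             llvm::shouldOptimizeForSize(MF, PSI, MBFI);
    }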
@@ -3940,7 +3945,7 @@
 
   MachineInstr *Result = XII.foldMemoryOperandImpl(
       *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
-      /*AllowCommute=*/true);
+      /*AllowCommute=*/true, nullptr, nullptr);
   if (!Result)
     return false;
 
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -48,11 +48,14 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/Support/Debug.h"
@@ -113,6 +116,8 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
                                        // guide some heuristics.
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -140,6 +145,9 @@
 
   /// Register Liveness information after the current instruction.
   LivePhysRegs LiveRegs;
+
+  ProfileSummaryInfo *PSI;
+  MachineBlockFrequencyInfo *MBFI;
 };
 char FixupBWInstPass::ID = 0;
 }
@@ -154,8 +162,11 @@
 
   this->MF = &MF;
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
-  OptForSize = MF.getFunction().hasOptSize();
   MLI = &getAnalysis<MachineLoopInfo>();
+  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  MBFI = (PSI && PSI->hasProfileSummary()) ?
+         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+         nullptr;
   LiveRegs.init(TII->getRegisterInfo());
 
   LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
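This and the following X86 passes query llvm::shouldOptimizeForSize on
MachineBasicBlocks. That overload is not shown in this diff; presumably it lives in
the new MachineSizeOpts.h included above and mirrors the IR-level version rewritten
in SizeOpts.cpp at the end of the patch. A sketch under that assumption (the
adapter name is hypothetical):

    // Assumed machine-level counterpart of the IR-level query; it forwards
    // to the shared template with an MBFI-based adapter.
    bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
                                     ProfileSummaryInfo *PSI,
                                     const MachineBlockFrequencyInfo *MBFI) {
      return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>(MBB, PSI,
                                                                    MBFI);
    }

Note the design choice in the wiring above: MBFI comes from
LazyMachineBlockFrequencyInfoPass and is only materialized when a profile summary
exists, so builds without profile data pay nothing for block-frequency computation.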
@@ -426,6 +437,9 @@
   // We run after PEI, so we need to AddPristinesAndCSRs.
   LiveRegs.addLiveOuts(MBB);
 
+  OptForSize = MF.getFunction().hasOptSize() ||
+               llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+
   for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
     MachineInstr *MI = &*I;
 
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -335,7 +335,7 @@
       // Do not want to hoist if we're not optimizing for size.
       // TODO: We'd like to remove this restriction.
       // See the comment in X86InstrInfo.td for more info.
-      if (!OptForSize)
+      if (!CurDAG->shouldOptForSize())
         return false;
 
       // Walk all the users of the immediate.
@@ -3019,7 +3019,7 @@
    LLVM_FALLTHROUGH;
   case X86ISD::ADD:
     // Try to match inc/dec.
-    if (!Subtarget->slowIncDec() || OptForSize) {
+    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
       bool IsOne = isOneConstant(StoredVal.getOperand(1));
       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
       // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26,6 +26,8 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -8289,7 +8291,7 @@
   // TODO: If multiple splats are generated to load the same constant,
   // it may be detrimental to overall size. There needs to be a way to detect
   // that condition to know if this is truly a size win.
-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
 
   // Handle broadcasting a single constant scalar from the constant pool
   // into a vector.
@@ -11144,7 +11146,7 @@
   case MVT::v32i16:
   case MVT::v64i8: {
     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
-    bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+    bool OptForSize = DAG.shouldOptForSize();
     if (!OptForSize) {
       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                                  Subtarget, DAG))
@@ -18282,7 +18284,7 @@
          "Unexpected funnel shift type!");
 
   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   if (!OptForSize && Subtarget.isSHLDSlow())
     return SDValue();
 
@@ -19620,7 +19622,7 @@
 /// implementation, and likely shuffle complexity of the alternate sequence.
 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
-  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool IsOptimizingSize = DAG.shouldOptForSize();
   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
 }
@@ -20368,7 +20370,7 @@
     } else {
       // Use BT if the immediate can't be encoded in a TEST instruction or we
       // are optimizing for size and the immediate won't fit in a byte.
-      bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+      bool OptForSize = DAG.shouldOptForSize();
       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
           isPowerOf2_64(AndRHSVal)) {
         Src = AndLHS;
@@ -39495,7 +39497,7 @@
     return SDValue();
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   unsigned Bits = VT.getScalarSizeInBits();
 
   // SHLD/SHRD instructions have lower register pressure, but on some
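Each hasOptSize() check above becomes a call to the new
SelectionDAG::shouldOptForSize(), defined outside this section. Presumably it folds
the function attribute together with the PGSO block query; a sketch, where FLI,
PSI, and BFI are assumed to be members the DAG carries after this patch:

    // Assumed definition of the new DAG-level hook: FunctionLoweringInfo
    // (FLI) supplies the current block, PSI/BFI the profile analyses.
    bool SelectionDAG::shouldOptForSize() const {
      return MF->getFunction().hasOptSize() ||
             llvm::shouldOptimizeForSize(FLI->MBB->getBasicBlock(), PSI, BFI);
    }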
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -24,7 +24,9 @@
 #include "X86GenInstrInfo.inc"
 
 namespace llvm {
+class MachineBlockFrequencyInfo;
 class MachineInstrBuilder;
+class ProfileSummaryInfo;
 class X86RegisterInfo;
 class X86Subtarget;
 
@@ -341,6 +343,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        ProfileSummaryInfo *PSI,
+                        const MachineBlockFrequencyInfo *MBFI,
                         LiveIntervals *LIS = nullptr,
                         VirtRegMap *VRM = nullptr) const override;
 
@@ -350,6 +354,7 @@
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
       LiveIntervals *LIS = nullptr) const override;
 
   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
@@ -427,9 +432,13 @@
 
   uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
 
-  void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+  void setExecutionDomain(MachineInstr &MI, unsigned Domain,
+                          ProfileSummaryInfo *PSI,
+                          const MachineBlockFrequencyInfo *MBFI) const override;
 
-  bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
+  bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain,
+                                ProfileSummaryInfo *PSI,
+                                const MachineBlockFrequencyInfo *MBFI) const;
 
   unsigned
   getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
@@ -444,7 +453,9 @@
                                       ArrayRef<MachineOperand> MOs,
                                       MachineBasicBlock::iterator InsertPt,
                                       unsigned Size, unsigned Alignment,
-                                      bool AllowCommute) const;
+                                      bool AllowCommute,
+                                      ProfileSummaryInfo *PSI,
+                                      const MachineBlockFrequencyInfo *MBFI) const;
 
   bool isHighLatencyDef(int opc) const override;
 
@@ -490,7 +501,9 @@
   MachineInstr *optimizeLoadInstr(MachineInstr &MI,
                                   const MachineRegisterInfo *MRI,
                                   unsigned &FoldAsLoadDefReg,
-                                  MachineInstr *&DefMI) const override;
+                                  MachineInstr *&DefMI,
+                                  ProfileSummaryInfo *PSI,
+                                  const MachineBlockFrequencyInfo *MBFI) const override;
 
   std::pair<unsigned, unsigned>
   decomposeMachineOperandsTargetFlags(unsigned TF) const override;
@@ -537,7 +550,10 @@
   /// non-commutable operands.
   /// Even though the instruction is commutable, the method may still
   /// fail to commute the operands; a null pointer is returned in such cases.
-  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI,
+                                       ProfileSummaryInfo *PSI,
+                                       const MachineBlockFrequencyInfo *MBFI,
+                                       bool NewMI,
                                        unsigned CommuteOpIdx1,
                                        unsigned CommuteOpIdx2) const override;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -19,14 +19,17 @@
 #include "X86TargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -1524,7 +1527,10 @@
 #undef VPERM_CASES
 }
 
-MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                   ProfileSummaryInfo *PSI,
+                                                   const MachineBlockFrequencyInfo *MBFI,
+                                                   bool NewMI,
                                                    unsigned OpIdx1,
                                                    unsigned OpIdx2) const {
   auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
@@ -1555,7 +1561,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
     WorkingMI.getOperand(3).setImm(Size - Amt);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::PFSUBrr:
@@ -1566,15 +1573,20 @@
         (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
   case X86::VBLENDPDrri:
-  case X86::VBLENDPSrri:
+  case X86::VBLENDPSrri: {
     // If we're optimizing for size, try to use MOVSD/MOVSS.
-    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
+    auto *MBB = MI.getParent();
+    auto *MF = MBB->getParent();
+    bool OptForSize = MF->getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize) {
       unsigned Mask, Opc;
       switch (MI.getOpcode()) {
       default: llvm_unreachable("Unreachable!");
@@ -1588,11 +1600,13 @@
         WorkingMI.setDesc(get(Opc));
         WorkingMI.RemoveOperand(3);
         return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
+                                                       PSI, MBFI,
                                                        /*NewMI=*/false,
                                                        OpIdx1, OpIdx2);
       }
     }
     LLVM_FALLTHROUGH;
+  }
   case X86::PBLENDWrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
@@ -1621,7 +1635,8 @@
     int8_t Imm = MI.getOperand(3).getImm() & Mask;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Mask ^ Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::INSERTPSrr:
@@ -1641,7 +1656,8 @@
       unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
     return nullptr;
@@ -1664,7 +1680,8 @@
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
       WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
 
@@ -1675,7 +1692,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(X86::SHUFPDrri));
     WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::SHUFPDrri: {
@@ -1684,7 +1702,8 @@
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(X86::MOVSDrr));
     WorkingMI.RemoveOperand(3);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::PCLMULQDQrr:
@@ -1700,7 +1719,8 @@
     unsigned Src2Hi = Imm & 0x10;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPCMPBZ128rri:  case X86::VPCMPUBZ128rri:
@@ -1732,7 +1752,8 @@
     Imm = X86::getSwappedVPCMPImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPCOMBri: case X86::VPCOMUBri:
@@ -1744,7 +1765,8 @@
     Imm = X86::getSwappedVPCOMImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VCMPSDZrr:
@@ -1765,7 +1787,8 @@
     Imm = X86::getSwappedVCMPImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPERM2F128rr:
@@ -1776,7 +1799,8 @@
     int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::MOVHLPSrr:
@@ -1799,7 +1823,8 @@
     }
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::CMOV16rr:  case X86::CMOV32rr:  case X86::CMOV64rr: {
@@ -1807,7 +1832,8 @@
     unsigned OpNo = MI.getDesc().getNumOperands() - 1;
     X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
     WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
@@ -1842,7 +1868,8 @@
   case X86::VPTERNLOGQZrmbikz: {
     auto &WorkingMI = cloneIfNew(MI);
     commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
-    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                   /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
   default: {
@@ -1850,7 +1877,8 @@
       unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
 
@@ -1861,11 +1889,13 @@
         getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
       auto &WorkingMI = cloneIfNew(MI);
       WorkingMI.setDesc(get(Opc));
-      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, PSI, MBFI,
+                                                     /*NewMI=*/false,
                                                      OpIdx1, OpIdx2);
     }
 
-    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    return TargetInstrInfo::commuteInstructionImpl(MI, PSI, MBFI,
+                                                   NewMI, OpIdx1, OpIdx2);
   }
   }
 }
@@ -3833,7 +3863,9 @@
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
                                               const MachineRegisterInfo *MRI,
                                               unsigned &FoldAsLoadDefReg,
-                                              MachineInstr *&DefMI) const {
+                                              MachineInstr *&DefMI,
+                                              ProfileSummaryInfo *PSI,
+                                              const MachineBlockFrequencyInfo *MBFI) const {
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
@@ -3859,7 +3891,8 @@
     return nullptr;
 
   // Check whether we can fold the def into SrcOperandId.
-  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI,
+                                               PSI, MBFI)) {
     FoldAsLoadDefReg = 0;
     return FoldMI;
   }
@@ -4822,7 +4855,8 @@
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
-    unsigned Size, unsigned Align, bool AllowCommute) const {
+    unsigned Size, unsigned Align, bool AllowCommute,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI) const {
   bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
   bool isTwoAddrFold = false;
 
@@ -4955,7 +4989,7 @@
         return nullptr;
 
       MachineInstr *CommutedMI =
-          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+          commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2);
       if (!CommutedMI) {
         // Unable to commute.
         return nullptr;
@@ -4968,13 +5002,14 @@
 
       // Attempt to fold with the commuted version of the instruction.
       NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
-                                    Size, Align, /*AllowCommute=*/false);
+                                    Size, Align, /*AllowCommute=*/false,
+                                    PSI, MBFI);
       if (NewMI)
         return NewMI;
 
       // Folding failed again - undo the commute before returning.
       MachineInstr *UncommutedMI =
-          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+          commuteInstruction(MI, PSI, MBFI, false, CommuteOpIdx1, CommuteOpIdx2);
       if (!UncommutedMI) {
         // Unable to commute.
         return nullptr;
@@ -5000,7 +5035,10 @@
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                                     ArrayRef<unsigned> Ops,
                                     MachineBasicBlock::iterator InsertPt,
-                                    int FrameIndex, LiveIntervals *LIS,
+                                    int FrameIndex,
+                                    ProfileSummaryInfo *PSI,
+                                    const MachineBlockFrequencyInfo *MBFI,
+                                    LiveIntervals *LIS,
                                     VirtRegMap *VRM) const {
   // Check switch flag
   if (NoFusing)
@@ -5050,7 +5088,8 @@
 
   return foldMemoryOperandImpl(MF, MI, Ops[0],
                                MachineOperand::CreateFI(FrameIndex), InsertPt,
-                               Size, Alignment, /*AllowCommute=*/true);
+                               Size, Alignment, /*AllowCommute=*/true,
+                               PSI, MBFI);
 }
 
 /// Check if \p LoadMI is a partial register load that we can't fold into \p MI
@@ -5191,6 +5230,7 @@
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
     LiveIntervals *LIS) const {
 
   // TODO: Support the case where LoadMI loads a wide register, but MI
@@ -5206,7 +5246,8 @@
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
     if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
       return nullptr;
-    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, PSI, MBFI,
+                                 LIS);
   }
 
   // Check switch flag
@@ -5358,7 +5399,8 @@
   }
   }
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
-                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true,
+                               PSI, MBFI);
 }
 
 static SmallVector<MachineMemOperand *, 2>
@@ -6685,7 +6727,9 @@
 }
 
 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
-                                            unsigned Domain) const {
+                                            unsigned Domain,
+                                            ProfileSummaryInfo *PSI,
+                                            const MachineBlockFrequencyInfo *MBFI) const {
   assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
@@ -6794,7 +6838,7 @@
         MI.getOperand(0).getSubReg() == 0 &&
         MI.getOperand(1).getSubReg() == 0 &&
         MI.getOperand(2).getSubReg() == 0) {
-      commuteInstruction(MI, false);
+      commuteInstruction(MI, PSI, MBFI, false);
       return true;
     }
     // We must always return true for MOVHLPSrr.
@@ -6857,13 +6901,15 @@
   return std::make_pair(domain, validDomains);
 }
 
-void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain,
+                                      ProfileSummaryInfo *PSI,
+                                      const MachineBlockFrequencyInfo *MBFI) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
 
   // Attempt to match for custom instructions.
-  if (setExecutionDomainCustom(MI, Domain))
+  if (setExecutionDomainCustom(MI, Domain, PSI, MBFI))
     return;
 
   const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -983,12 +983,12 @@
 // the Function object through the <Target>Subtarget and objections were raised
 // to that (see post-commit review comments for r301750).
 let RecomputePerFunction = 1 in {
-  def OptForSize   : Predicate<"MF->getFunction().hasOptSize()">;
+  def OptForSize   : Predicate<"shouldOptForSize(MF)">;
   def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
-  def OptForSpeed  : Predicate<"!MF->getFunction().hasOptSize()">;
+  def OptForSpeed  : Predicate<"!shouldOptForSize(MF)">;
   def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
-                            "MF->getFunction().hasOptSize()">;
-  def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
+                            "shouldOptForSize(MF)">;
+  def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
                                         "!Subtarget->hasSSE41()">;
 }
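These predicate strings are spliced into the generated X86 DAG ISel, so
shouldOptForSize(MF) must resolve inside that class. Its definition is elsewhere in
the patch; a plausible shape, mirroring the stub added to X86FastISel earlier (the
class placement is an assumption):

    // Assumed member of X86DAGToDAGISel, analogous to the FastISel stub.
    bool shouldOptForSize(const MachineFunction *MF) const {
      // TODO: Implement PGSO.
      return MF->getFunction().hasOptSize();
    }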
 
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -25,6 +25,8 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,6 +34,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -247,6 +250,12 @@
 
   static char ID;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
 
@@ -681,6 +690,11 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
@@ -699,7 +713,9 @@
 
-    // Remove redundant address calculations. Do it only for -Os/-Oz since only
-    // a code size gain is expected from this part of the pass.
+    // Remove redundant address calculations. Do it only when optimizing for
+    // size, since only a code size gain is expected from this part of the pass.
-    if (MF.getFunction().hasOptSize())
+    bool OptForSize = MF.getFunction().hasOptSize() ||
+                      llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+    if (OptForSize)
       Changed |= removeRedundantAddrCalc(LEAs);
   }
 
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -17,8 +17,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Function.h"
@@ -52,6 +55,12 @@
 
     bool runOnMachineFunction(MachineFunction &MF) override;
 
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<ProfileSummaryInfoWrapperPass>();
+      AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
     MachineFunctionProperties getRequiredProperties() const override {
       return MachineFunctionProperties().set(
           MachineFunctionProperties::Property::NoVRegs);
@@ -105,6 +114,12 @@
 
   TSM.init(&MF.getSubtarget());
 
+  auto *PSI =
+      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+               &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+               nullptr;
+
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
   VisitedBBs.clear();
@@ -118,6 +133,11 @@
     MachineBasicBlock *MBB = I->first;
     unsigned Cycles = I->second;
 
+    // Function::hasOptSize is already checked above.
+    bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+    if (OptForSize)
+      continue;
+
     if (Cycles < Threshold) {
       // BB ends in a return. Skip over any DBG_VALUE instructions
       // trailing the terminator.
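The "already checked above" comment refers to an early-out near the top of
runOnMachineFunction, outside these hunks; presumably something like the sketch
below, which is why only the block-level PGSO query is needed in the loop:

    // Assumed existing guard earlier in the function: padding is purely a
    // speed optimization, so size-optimized functions are skipped outright.
    if (MF.getFunction().hasOptSize())
      return false;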
diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp
--- a/llvm/lib/Transforms/Utils/SizeOpts.cpp
+++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp
@@ -10,28 +10,66 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
+
 using namespace llvm;
 
-static cl::opt<bool> ProfileGuidedSizeOpt(
+cl::opt<bool> EnablePGSO(
     "pgso", cl::Hidden, cl::init(true),
-    cl::desc("Enable the profile guided size optimization. "));
+    cl::desc("Enable the profile guided size optimizations."));
+
+cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
+    "pgso-lwss-only", cl::Hidden, cl::init(true),
+    cl::desc("Apply the profile guided size optimizations only "
+             "if the working set size is large (except for cold code)."));
+
+cl::opt<bool> ForcePGSO(
+    "force-pgso", cl::Hidden, cl::init(false),
+    cl::desc("Force the profile-guided size optimizations."));
+
+cl::opt<int> PgsoCutoffInstrProf(
+    "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore,
+    cl::desc("The profile guided size optimization profile summary cutoff "
+             "for instrumentation profile."));
+
+cl::opt<int> PgsoCutoffSampleProf(
+    "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore,
+    cl::desc("The profile guided size optimization profile summary cutoff "
+             "for sample profile."));
+
+namespace {
+struct BasicBlockBFIAdapter {
+  static bool isFunctionColdInCallGraph(const Function *F,
+                                        ProfileSummaryInfo *PSI,
+                                        BlockFrequencyInfo &BFI) {
+    return PSI->isFunctionColdInCallGraph(F, BFI);
+  }
+  static bool isFunctionHotInCallGraphNthPercentile(int CutOff,
+                                                    const Function *F,
+                                                    ProfileSummaryInfo *PSI,
+                                                    BlockFrequencyInfo &BFI) {
+    return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
+  }
+  static bool isColdBlock(const BasicBlock *BB,
+                          ProfileSummaryInfo *PSI,
+                          BlockFrequencyInfo *BFI) {
+    return PSI->isColdBlock(BB, BFI);
+  }
+  static bool isHotBlockNthPercentile(int CutOff,
+                                      const BasicBlock *BB,
+                                      ProfileSummaryInfo *PSI,
+                                      BlockFrequencyInfo *BFI) {
+    return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
+  }
+};
+} // end anonymous namespace
 
-bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
+bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI) {
-  assert(F);
-  if (!PSI || !BFI || !PSI->hasProfileSummary())
-    return false;
-  return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI);
+  return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI);
 }
 
-bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
+bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI) {
-  assert(BB);
-  if (!PSI || !BFI || !PSI->hasProfileSummary())
-    return false;
-  return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI);
+  return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI);
 }
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
@@ -44,13 +45,17 @@
 ; CHECK-NEXT:       Analysis for ComputingKnownBits
 ; CHECK-NEXT:       InstructionSelect
 ; CHECK-NEXT:       ResetMachineFunction
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       AArch64 Instruction Selection
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT:       Local Stack Slot Allocation
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Fast Register Allocator
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Profile summary info
+; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
@@ -35,6 +35,9 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
@@ -76,10 +79,13 @@
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       AArch64 Instruction Selection
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       AArch64 Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Early Tail Duplication
 ; CHECK-NEXT:       Optimize machine instruction PHIs
 ; CHECK-NEXT:       Slot index numbering
@@ -91,6 +97,7 @@
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       AArch64 Conditional Compares
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine InstCombiner
 ; CHECK-NEXT:       AArch64 Conditional Branch Tuning
 ; CHECK-NEXT:       Machine Trace Metrics
@@ -106,6 +113,7 @@
 ; CHECK-NEXT:       Machine Common Subexpression Elimination
 ; CHECK-NEXT:       MachinePostDominator Tree Construction
 ; CHECK-NEXT:       Machine code sinking
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
 ; CHECK-NEXT:       AArch64 Dead register definitions
@@ -114,6 +122,7 @@
 ; CHECK-NEXT:       Remove unreachable machine basic blocks
 ; CHECK-NEXT:       Live Variable Analysis
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Slot index numbering
@@ -128,7 +137,6 @@
 ; CHECK-NEXT:       Live Register Matrix
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       Spill Code Placement Analysis
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Greedy Register Allocator
 ; CHECK-NEXT:       Virtual Register Rewriter
@@ -147,6 +155,7 @@
 ; CHECK-NEXT:       Shrink Wrapping analysis
 ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:       Control Flow Optimizer
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Tail Duplication
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
@@ -0,0 +1,130 @@
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-apple-darwin -o - | \
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-DARWIN %s
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-linux-gnu -o - | \
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
+
+; CHECK-LABEL: fct1:
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct1(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+; CHECK-LABEL: fct2:
+; When the size is greater than 256, change memset into bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct2(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct3:
+; For unknown size, change to bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct3(i8* nocapture %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct4:
+; Size <= 256, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct4(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK-LABEL: fct5:
+; Size > 256, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct5(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct6:
+; Size = unknown, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct6(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+  ret void
+}
+
+; The following functions check that memset is not turned into bzero
+; when the value being set is non-zero, regardless of the given size.
+
+; CHECK-LABEL: fct7:
+; memset with a non-zero value, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct7(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct8:
+; memset with a non-zero value, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct8(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct9:
+; memset with a non-zero value, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct9(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
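
The metadata block above is the pattern every PGSO test in this patch repeats: a module-level ProfileSummary whose detailed summary puts an entry count of 0 below the cold threshold, plus !14 attaching that zero entry count to each test function through !prof. A minimal sketch of the idiom (function name hypothetical):

; Assuming the ProfileSummary module flags above: an entry count of 0 is
; cold under the summary, so the profile-guided size heuristic lowers
; @cold_fn the same way it would a function marked 'optsize'.
define void @cold_fn() !prof !14 {
  ret void
}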
diff --git a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
--- a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
@@ -18,7 +18,6 @@
 ; GreedyRegAlloc, please adjust accordingly.)
 
 ; HOTNESS:      Executing Pass 'Spill Code Placement Analysis'
-; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis'
 ; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter'
 ; HOTNESS-NEXT: MachineBlockFrequencyInfo is available
 ; HOTNESS-NEXT: Executing Pass 'Greedy Register Allocator'
diff --git a/llvm/test/CodeGen/AArch64/max-jump-table.ll b/llvm/test/CodeGen/AArch64/max-jump-table.ll
--- a/llvm/test/CodeGen/AArch64/max-jump-table.ll
+++ b/llvm/test/CodeGen/AArch64/max-jump-table.ll
@@ -215,3 +215,136 @@
 
 return: ret void
 }
+
+define i32 @jt1_optsize(i32 %a, i32 %b) optsize {
+entry:
+  switch i32 %a, label %return [
+    i32 1,  label %bb1
+    i32 2,  label %bb2
+    i32 3,  label %bb3
+    i32 4,  label %bb4
+    i32 5,  label %bb5
+    i32 6,  label %bb6
+    i32 7,  label %bb7
+    i32 8,  label %bb8
+    i32 9,  label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_optsize:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT:   %jump-table.1:
+; CHECK4-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT:   %jump-table.1:
+; CHECK8-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT:   %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT:  %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT:  %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT:  %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_optsize.
+
+bb1:  tail call void @ext(i32 1, i32 0)  br label %return
+bb2:  tail call void @ext(i32 2, i32 2)  br label %return
+bb3:  tail call void @ext(i32 3, i32 4)  br label %return
+bb4:  tail call void @ext(i32 4, i32 6)  br label %return
+bb5:  tail call void @ext(i32 5, i32 8)  br label %return
+bb6:  tail call void @ext(i32 6, i32 10) br label %return
+bb7:  tail call void @ext(i32 7, i32 12) br label %return
+bb8:  tail call void @ext(i32 8, i32 14) br label %return
+bb9:  tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+define i32 @jt1_pgso(i32 %a, i32 %b) !prof !14 {
+entry:
+  switch i32 %a, label %return [
+    i32 1,  label %bb1
+    i32 2,  label %bb2
+    i32 3,  label %bb3
+    i32 4,  label %bb4
+    i32 5,  label %bb5
+    i32 6,  label %bb6
+    i32 7,  label %bb7
+    i32 8,  label %bb8
+    i32 9,  label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_pgso:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT:   %jump-table.1:
+; CHECK4-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT:   %jump-table.1:
+; CHECK8-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT:   %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT:  %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT:  %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT:  %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_pgso.
+
+bb1:  tail call void @ext(i32 1, i32 0)  br label %return
+bb2:  tail call void @ext(i32 2, i32 2)  br label %return
+bb3:  tail call void @ext(i32 3, i32 4)  br label %return
+bb4:  tail call void @ext(i32 4, i32 6)  br label %return
+bb5:  tail call void @ext(i32 5, i32 8)  br label %return
+bb6:  tail call void @ext(i32 6, i32 10) br label %return
+bb7:  tail call void @ext(i32 7, i32 12) br label %return
+bb8:  tail call void @ext(i32 8, i32 14) br label %return
+bb9:  tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
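
jt1_optsize and jt1_pgso run the same 17-case switch under every jump-table size limit the file's RUN lines configure (the CHECK0/4/8/16 and CHECKM1/M3 prefixes): once a function is optimized for size, switch lowering keeps all cases in a single table rather than splitting at the limit, since one large table with one bounds check is smaller than several tables with their own range checks. Both functions check the same shape:

; <prefix>-NEXT: %jump-table.0: %bb.1 ... %bb.17   ; one table holding every case
; <prefix>-NOT:  %jump-table.1:                    ; never split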
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -19,6 +19,9 @@
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Function Alias Analysis Results
 ; CHECK-NEXT:      Merge contiguous icmps into a memcmp
+; CHECK-NEXT:      Natural Loop Information
+; CHECK-NEXT:      Lazy Branch Probability Analysis
+; CHECK-NEXT:      Lazy Block Frequency Analysis
 ; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
@@ -63,8 +66,11 @@
 ; CHECK-NEXT:      Function Alias Analysis Results
 ; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Branch Probability Analysis
+; CHECK-NEXT:      Lazy Branch Probability Analysis
+; CHECK-NEXT:      Lazy Block Frequency Analysis
 ; CHECK-NEXT:      ARM Instruction Selection
 ; CHECK-NEXT:      Finalize ISel and expand pseudo-instructions
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      Early Tail Duplication
 ; CHECK-NEXT:      Optimize machine instruction PHIs
 ; CHECK-NEXT:      Slot index numbering
@@ -79,6 +85,7 @@
 ; CHECK-NEXT:      Machine Common Subexpression Elimination
 ; CHECK-NEXT:      MachinePostDominator Tree Construction
 ; CHECK-NEXT:      Machine code sinking
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      Peephole Optimizations
 ; CHECK-NEXT:      Remove dead machine instructions
 ; CHECK-NEXT:      ARM MLA / MLS expansion pass
@@ -91,6 +98,7 @@
 ; CHECK-NEXT:      MachineDominator Tree Construction
 ; CHECK-NEXT:      Machine Natural Loop Construction
 ; CHECK-NEXT:      Eliminate PHI nodes for register allocation
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      Two-Address instruction pass
 ; CHECK-NEXT:      Slot index numbering
 ; CHECK-NEXT:      Live Interval Analysis
@@ -104,7 +112,6 @@
 ; CHECK-NEXT:      Live Register Matrix
 ; CHECK-NEXT:      Bundle Machine CFG Edges
 ; CHECK-NEXT:      Spill Code Placement Analysis
-; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      Machine Optimization Remark Emitter
 ; CHECK-NEXT:      Greedy Register Allocator
 ; CHECK-NEXT:      Virtual Register Rewriter
@@ -120,11 +127,13 @@
 ; CHECK-NEXT:      Shrink Wrapping analysis
 ; CHECK-NEXT:      Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:      Control Flow Optimizer
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      Tail Duplication
 ; CHECK-NEXT:      Machine Copy Propagation Pass
 ; CHECK-NEXT:      Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT:      ARM load / store optimization pass
 ; CHECK-NEXT:      ReachingDefAnalysis
+; CHECK-NEXT:      Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:      ARM Execution Domain Fix
 ; CHECK-NEXT:      BreakFalseDeps
 ; CHECK-NEXT:      ARM pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/ARM/constantpool-align.ll b/llvm/test/CodeGen/ARM/constantpool-align.ll
--- a/llvm/test/CodeGen/ARM/constantpool-align.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-align.ll
@@ -17,3 +17,28 @@
   store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
   ret void 
 }
+
+; CHECK-LABEL: f_pgso:
+; CHECK: vld1.64 {{.*}}, [r1]
+; CHECK: .p2align 3
+define void @f_pgso(<4 x i32>* %p) !prof !14 {
+  store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
+  ret void 
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
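
f_pgso mirrors the optsize case already in this file: when optimizing for size, ARM does not over-align the constant-pool entry to 16 bytes, so the load carries no :128 alignment qualifier and the pool is emitted at .p2align 3 (8 bytes). For contrast, the default-case checks (in the unchanged part of the file, quoted here as an assumption) look like:

; default:        vld1.64 {{.*}}, [r1:128]   with  .p2align 4
; optsize/PGSO:   vld1.64 {{.*}}, [r1]       with  .p2align 3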
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -23,6 +23,17 @@
   ret void
 }
 
+; Perform tail call optimization for an external symbol under PGSO.
+@dest_pgso = global [2 x i8] zeroinitializer
+define void @caller_extern_pgso(i8* %src) !prof !14 {
+entry:
+; CHECK: caller_extern_pgso
+; CHECK-NOT: call memcpy
+; CHECK: tail memcpy
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @dest_pgso, i32 0, i32 0), i8* %src, i32 7, i1 false)
+  ret void
+}
+
 ; Perform indirect tail call optimization (for function pointer call).
 declare void @callee_indirect1()
 declare void @callee_indirect2()
@@ -146,3 +157,20 @@
   tail call void @callee_nostruct()
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
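
caller_extern_pgso checks the same tail-call property as the surrounding tests, but driven by profile data: with a cold entry count, the memcpy libcall is emitted through RISC-V's tail-call pseudo instead of an ordinary call, so the caller needs no return sequence of its own. In check form:

; CHECK-NOT: call memcpy     ; no plain call followed by ret
; CHECK:     tail memcpy     ; tail-call pseudo; the callee's ret returns for us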
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -14,6 +14,7 @@
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
@@ -36,6 +37,10 @@
 ; CHECK-NEXT:       Safe Stack instrumentation pass
 ; CHECK-NEXT:       Insert stack protectors
 ; CHECK-NEXT:       Module Verifier
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT:       X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
@@ -45,11 +50,11 @@
 ; CHECK-NEXT:       X86 EFLAGS copy lowering
 ; CHECK-NEXT:       X86 WinAlloca Expander
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Fast Register Allocator
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       X86 FP Stackifier
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll
--- a/llvm/test/CodeGen/X86/O3-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -13,8 +13,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Profile summary info
+; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
@@ -32,6 +32,9 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
@@ -63,12 +66,15 @@
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       X86 DAG->DAG Instruction Selection
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Local Dynamic TLS Access Clean-up
 ; CHECK-NEXT:       X86 PIC Global Base Reg Initialization
 ; CHECK-NEXT:        Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT:       X86 Domain Reassignment Pass
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Early Tail Duplication
 ; CHECK-NEXT:       Optimize machine instruction PHIs
 ; CHECK-NEXT:       Slot index numbering
@@ -79,6 +85,7 @@
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Machine Trace Metrics
 ; CHECK-NEXT:       Early If-Conversion
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine InstCombiner
 ; CHECK-NEXT:       X86 cmov Conversion
 ; CHECK-NEXT:       MachineDominator Tree Construction
@@ -89,10 +96,12 @@
 ; CHECK-NEXT:       Machine Common Subexpression Elimination
 ; CHECK-NEXT:       MachinePostDominator Tree Construction
 ; CHECK-NEXT:       Machine code sinking
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
 ; CHECK-NEXT:       Live Range Shrink
 ; CHECK-NEXT:       X86 Fixup SetCC
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 LEA Optimize
 ; CHECK-NEXT:       X86 Optimize Call Frame
 ; CHECK-NEXT:       X86 Avoid Store Forwarding Block
@@ -107,6 +116,7 @@
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Machine Natural Loop Construction
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Slot index numbering
 ; CHECK-NEXT:       Live Interval Analysis
@@ -120,7 +130,6 @@
 ; CHECK-NEXT:       Live Register Matrix
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       Spill Code Placement Analysis
-; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Greedy Register Allocator
 ; CHECK-NEXT:       Virtual Register Rewriter
@@ -138,6 +147,7 @@
 ; CHECK-NEXT:       Shrink Wrapping analysis
 ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization
 ; CHECK-NEXT:       Control Flow Optimizer
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Tail Duplication
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass
@@ -150,13 +160,16 @@
 ; CHECK-NEXT:       MachinePostDominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Basic Block Placement
 ; CHECK-NEXT:       ReachingDefAnalysis
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Execution Dependency Fix
 ; CHECK-NEXT:       BreakFalseDeps
 ; CHECK-NEXT:       X86 Indirect Branch Tracking
 ; CHECK-NEXT:       X86 vzeroupper inserter
 ; CHECK-NEXT:       MachineDominator Tree Construction
 ; CHECK-NEXT:       Machine Natural Loop Construction
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Byte/Word Instruction Fixup
+; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Atom pad short functions
 ; CHECK-NEXT:       X86 LEA Fixup
 ; CHECK-NEXT:       Compressing EVEX instrs to VEX encoding when possible
diff --git a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
--- a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
+++ b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
@@ -29,6 +29,13 @@
   ret i32 %a
 }
 
+define i32 @test_pgso(i32 %a) nounwind !prof !14 {
+; CHECK: test_pgso
+; CHECK: movl
+; CHECK-NEXT: ret
+  ret i32 %a
+}
+
 define i32 @test_add(i32 %a, i32 %b) nounwind {
 ; CHECK: test_add
 ; CHECK: addl
@@ -101,3 +108,19 @@
   ret void
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll
--- a/llvm/test/CodeGen/X86/avx-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt.ll
@@ -190,6 +190,16 @@
   ret float %res
 }
 
+define float @floor_f32_load_pgso(float* %aptr) !prof !14 {
+; CHECK-LABEL: floor_f32_load_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vroundss $9, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a = load float, float* %aptr
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+
 define double @nearbyint_f64_load(double* %aptr) optsize {
 ; CHECK-LABEL: nearbyint_f64_load:
 ; CHECK:       # %bb.0:
@@ -200,3 +210,29 @@
   ret double %res
 }
 
+define double @nearbyint_f64_load_pgso(double* %aptr) !prof !14 {
+; CHECK-LABEL: nearbyint_f64_load_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a = load double, double* %aptr
+  %res = call double @llvm.nearbyint.f64(double %a)
+  ret double %res
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1970,6 +1970,47 @@
   ret <32 x i16> %ret
 }
 
+define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
+; KNL-LABEL: test_build_vec_v32i1_pgso:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_build_vec_v32i1_pgso:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_build_vec_v32i1_pgso:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_build_vec_v32i1_pgso:
+; AVX512DQ:       ## %bb.0:
+; AVX512DQ-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; X86-LABEL: test_build_vec_v32i1_pgso:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+  %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
 define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
 ; KNL-LABEL: test_build_vec_v64i1:
 ; KNL:       ## %bb.0:
@@ -2013,12 +2054,12 @@
 ; KNL-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB43_2
+; KNL-NEXT:    je LBB44_2
 ; KNL-NEXT:  ## %bb.1: ## %L1
 ; KNL-NEXT:    vmovapd %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB43_2: ## %L2
+; KNL-NEXT:  LBB44_2: ## %L2
 ; KNL-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -2029,12 +2070,12 @@
 ; SKX-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
 ; SKX-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; SKX-NEXT:    kortestb %k0, %k0
-; SKX-NEXT:    je LBB43_2
+; SKX-NEXT:    je LBB44_2
 ; SKX-NEXT:  ## %bb.1: ## %L1
 ; SKX-NEXT:    vmovapd %zmm0, (%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB43_2: ## %L2
+; SKX-NEXT:  LBB44_2: ## %L2
 ; SKX-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -2046,12 +2087,12 @@
 ; AVX512BW-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB43_2
+; AVX512BW-NEXT:    je LBB44_2
 ; AVX512BW-NEXT:  ## %bb.1: ## %L1
 ; AVX512BW-NEXT:    vmovapd %zmm0, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB43_2: ## %L2
+; AVX512BW-NEXT:  LBB44_2: ## %L2
 ; AVX512BW-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2062,12 +2103,12 @@
 ; AVX512DQ-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
 ; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; AVX512DQ-NEXT:    kortestb %k0, %k0
-; AVX512DQ-NEXT:    je LBB43_2
+; AVX512DQ-NEXT:    je LBB44_2
 ; AVX512DQ-NEXT:  ## %bb.1: ## %L1
 ; AVX512DQ-NEXT:    vmovapd %zmm0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB43_2: ## %L2
+; AVX512DQ-NEXT:  LBB44_2: ## %L2
 ; AVX512DQ-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2079,12 +2120,12 @@
 ; X86-NEXT:    vmovupd 8(%eax), %zmm1 {%k1} {z}
 ; X86-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; X86-NEXT:    kortestb %k0, %k0
-; X86-NEXT:    je LBB43_2
+; X86-NEXT:    je LBB44_2
 ; X86-NEXT:  ## %bb.1: ## %L1
 ; X86-NEXT:    vmovapd %zmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB43_2: ## %L2
+; X86-NEXT:  LBB44_2: ## %L2
 ; X86-NEXT:    vmovapd %zmm0, 8(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -2131,13 +2172,13 @@
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    je LBB44_2
+; KNL-NEXT:    je LBB45_2
 ; KNL-NEXT:  ## %bb.1: ## %L1
 ; KNL-NEXT:    vmovaps %zmm0, (%rdi)
 ; KNL-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB44_2: ## %L2
+; KNL-NEXT:  LBB45_2: ## %L2
 ; KNL-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; KNL-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -2154,13 +2195,13 @@
 ; SKX-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; SKX-NEXT:    kunpckwd %k1, %k2, %k1
 ; SKX-NEXT:    kortestd %k1, %k0
-; SKX-NEXT:    je LBB44_2
+; SKX-NEXT:    je LBB45_2
 ; SKX-NEXT:  ## %bb.1: ## %L1
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB44_2: ## %L2
+; SKX-NEXT:  LBB45_2: ## %L2
 ; SKX-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; SKX-NEXT:    vzeroupper
@@ -2177,13 +2218,13 @@
 ; AVX512BW-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; AVX512BW-NEXT:    kunpckwd %k1, %k2, %k1
 ; AVX512BW-NEXT:    kortestd %k1, %k0
-; AVX512BW-NEXT:    je LBB44_2
+; AVX512BW-NEXT:    je LBB45_2
 ; AVX512BW-NEXT:  ## %bb.1: ## %L1
 ; AVX512BW-NEXT:    vmovaps %zmm0, (%rdi)
 ; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB44_2: ## %L2
+; AVX512BW-NEXT:  LBB45_2: ## %L2
 ; AVX512BW-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; AVX512BW-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -2203,13 +2244,13 @@
 ; AVX512DQ-NEXT:    kmovw %k0, %ecx
 ; AVX512DQ-NEXT:    shll $16, %ecx
 ; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    je LBB44_2
+; AVX512DQ-NEXT:    je LBB45_2
 ; AVX512DQ-NEXT:  ## %bb.1: ## %L1
 ; AVX512DQ-NEXT:    vmovaps %zmm0, (%rdi)
 ; AVX512DQ-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB44_2: ## %L2
+; AVX512DQ-NEXT:  LBB45_2: ## %L2
 ; AVX512DQ-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; AVX512DQ-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2227,13 +2268,13 @@
 ; X86-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; X86-NEXT:    kunpckwd %k1, %k2, %k1
 ; X86-NEXT:    kortestd %k1, %k0
-; X86-NEXT:    je LBB44_2
+; X86-NEXT:    je LBB45_2
 ; X86-NEXT:  ## %bb.1: ## %L1
 ; X86-NEXT:    vmovaps %zmm0, (%eax)
 ; X86-NEXT:    vmovaps %zmm1, 64(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB44_2: ## %L2
+; X86-NEXT:  LBB45_2: ## %L2
 ; X86-NEXT:    vmovaps %zmm0, 4(%eax)
 ; X86-NEXT:    vmovaps %zmm1, 68(%eax)
 ; X86-NEXT:    vzeroupper
@@ -4188,12 +4229,12 @@
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testw %ax, %ax
-; KNL-NEXT:    jle LBB65_1
+; KNL-NEXT:    jle LBB66_1
 ; KNL-NEXT:  ## %bb.2: ## %bb.2
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB65_1: ## %bb.1
+; KNL-NEXT:  LBB66_1: ## %bb.1
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4207,12 +4248,12 @@
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testw %ax, %ax
-; SKX-NEXT:    jle LBB65_1
+; SKX-NEXT:    jle LBB66_1
 ; SKX-NEXT:  ## %bb.2: ## %bb.2
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB65_1: ## %bb.1
+; SKX-NEXT:  LBB66_1: ## %bb.1
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4226,12 +4267,12 @@
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testw %ax, %ax
-; AVX512BW-NEXT:    jle LBB65_1
+; AVX512BW-NEXT:    jle LBB66_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %bb.2
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB65_1: ## %bb.1
+; AVX512BW-NEXT:  LBB66_1: ## %bb.1
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4245,12 +4286,12 @@
 ; AVX512DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    testw %ax, %ax
-; AVX512DQ-NEXT:    jle LBB65_1
+; AVX512DQ-NEXT:    jle LBB66_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %bb.2
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB65_1: ## %bb.1
+; AVX512DQ-NEXT:  LBB66_1: ## %bb.1
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4264,12 +4305,12 @@
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    jle LBB65_1
+; X86-NEXT:    jle LBB66_1
 ; X86-NEXT:  ## %bb.2: ## %bb.2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB65_1: ## %bb.1
+; X86-NEXT:  LBB66_1: ## %bb.1
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4297,11 +4338,11 @@
 ; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT:    kortestw %k0, %k0
-; CHECK-NEXT:    jb LBB66_2
+; CHECK-NEXT:    jb LBB67_2
 ; CHECK-NEXT:  ## %bb.1: ## %bb.1
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _foo
-; CHECK-NEXT:  LBB66_2: ## %bb.2
+; CHECK-NEXT:  LBB67_2: ## %bb.2
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -4313,11 +4354,11 @@
 ; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kortestw %k0, %k0
-; X86-NEXT:    jb LBB66_2
+; X86-NEXT:    jb LBB67_2
 ; X86-NEXT:  ## %bb.1: ## %bb.1
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
-; X86-NEXT:  LBB66_2: ## %bb.2
+; X86-NEXT:  LBB67_2: ## %bb.2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4505,12 +4546,12 @@
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB72_1
+; KNL-NEXT:    je LBB73_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB72_1: ## %bar
+; KNL-NEXT:  LBB73_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4527,12 +4568,12 @@
 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
 ; SKX-NEXT:    ktestb %k1, %k0
-; SKX-NEXT:    je LBB72_1
+; SKX-NEXT:    je LBB73_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB72_1: ## %bar
+; SKX-NEXT:  LBB73_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4555,12 +4596,12 @@
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB72_1
+; AVX512BW-NEXT:    je LBB73_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB72_1: ## %bar
+; AVX512BW-NEXT:  LBB73_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4581,12 +4622,12 @@
 ; AVX512DQ-NEXT:    korb %k1, %k0, %k0
 ; AVX512DQ-NEXT:    korb %k3, %k2, %k1
 ; AVX512DQ-NEXT:    ktestb %k1, %k0
-; AVX512DQ-NEXT:    je LBB72_1
+; AVX512DQ-NEXT:    je LBB73_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB72_1: ## %bar
+; AVX512DQ-NEXT:  LBB73_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4603,12 +4644,12 @@
 ; X86-NEXT:    vptestnmd %ymm3, %ymm3, %k2
 ; X86-NEXT:    korb %k2, %k1, %k1
 ; X86-NEXT:    ktestb %k1, %k0
-; X86-NEXT:    je LBB72_1
+; X86-NEXT:    je LBB73_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB72_1: ## %bar
+; X86-NEXT:  LBB73_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4646,12 +4687,12 @@
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB73_1
+; KNL-NEXT:    je LBB74_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB73_1: ## %bar
+; KNL-NEXT:  LBB74_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4668,12 +4709,12 @@
 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
 ; SKX-NEXT:    ktestb %k1, %k0
-; SKX-NEXT:    je LBB73_1
+; SKX-NEXT:    je LBB74_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB73_1: ## %bar
+; SKX-NEXT:  LBB74_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4692,12 +4733,12 @@
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB73_1
+; AVX512BW-NEXT:    je LBB74_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB73_1: ## %bar
+; AVX512BW-NEXT:  LBB74_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4714,12 +4755,12 @@
 ; AVX512DQ-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; AVX512DQ-NEXT:    korb %k2, %k1, %k1
 ; AVX512DQ-NEXT:    ktestb %k1, %k0
-; AVX512DQ-NEXT:    je LBB73_1
+; AVX512DQ-NEXT:    je LBB74_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB73_1: ## %bar
+; AVX512DQ-NEXT:  LBB74_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4736,12 +4777,12 @@
 ; X86-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; X86-NEXT:    korb %k2, %k1, %k1
 ; X86-NEXT:    ktestb %k1, %k0
-; X86-NEXT:    je LBB73_1
+; X86-NEXT:    je LBB74_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB73_1: ## %bar
+; X86-NEXT:  LBB74_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4778,12 +4819,12 @@
 ; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
-; KNL-NEXT:    je LBB74_1
+; KNL-NEXT:    je LBB75_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB74_1: ## %bar
+; KNL-NEXT:  LBB75_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4800,12 +4841,12 @@
 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korw %k2, %k1, %k1
 ; SKX-NEXT:    ktestw %k1, %k0
-; SKX-NEXT:    je LBB74_1
+; SKX-NEXT:    je LBB75_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB74_1: ## %bar
+; SKX-NEXT:  LBB75_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4823,12 +4864,12 @@
 ; AVX512BW-NEXT:    korw %k2, %k1, %k1
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kortestw %k0, %k0
-; AVX512BW-NEXT:    je LBB74_1
+; AVX512BW-NEXT:    je LBB75_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB74_1: ## %bar
+; AVX512BW-NEXT:  LBB75_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4845,12 +4886,12 @@
 ; AVX512DQ-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; AVX512DQ-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQ-NEXT:    ktestw %k1, %k0
-; AVX512DQ-NEXT:    je LBB74_1
+; AVX512DQ-NEXT:    je LBB75_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB74_1: ## %bar
+; AVX512DQ-NEXT:  LBB75_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4867,12 +4908,12 @@
 ; X86-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; X86-NEXT:    korw %k2, %k1, %k1
 ; X86-NEXT:    ktestw %k1, %k0
-; X86-NEXT:    je LBB74_1
+; X86-NEXT:    je LBB75_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB74_1: ## %bar
+; X86-NEXT:  LBB75_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4928,12 +4969,12 @@
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    je LBB75_1
+; KNL-NEXT:    je LBB76_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB75_1: ## %bar
+; KNL-NEXT:  LBB76_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4950,12 +4991,12 @@
 ; SKX-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    ktestd %k1, %k0
-; SKX-NEXT:    je LBB75_1
+; SKX-NEXT:    je LBB76_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB75_1: ## %bar
+; SKX-NEXT:  LBB76_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4972,12 +5013,12 @@
 ; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; AVX512BW-NEXT:    kord %k2, %k1, %k1
 ; AVX512BW-NEXT:    ktestd %k1, %k0
-; AVX512BW-NEXT:    je LBB75_1
+; AVX512BW-NEXT:    je LBB76_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB75_1: ## %bar
+; AVX512BW-NEXT:  LBB76_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -5014,12 +5055,12 @@
 ; AVX512DQ-NEXT:    kmovw %k0, %ecx
 ; AVX512DQ-NEXT:    shll $16, %ecx
 ; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    je LBB75_1
+; AVX512DQ-NEXT:    je LBB76_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB75_1: ## %bar
+; AVX512DQ-NEXT:  LBB76_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -5036,12 +5077,12 @@
 ; X86-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; X86-NEXT:    kord %k2, %k1, %k1
 ; X86-NEXT:    ktestd %k1, %k0
-; X86-NEXT:    je LBB75_1
+; X86-NEXT:    je LBB76_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB75_1: ## %bar
+; X86-NEXT:  LBB76_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -5121,12 +5162,12 @@
 ; KNL-NEXT:    orl %eax, %edx
 ; KNL-NEXT:    shlq $32, %rdx
 ; KNL-NEXT:    orq %rcx, %rdx
-; KNL-NEXT:    je LBB76_1
+; KNL-NEXT:    je LBB77_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB76_1: ## %bar
+; KNL-NEXT:  LBB77_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -5143,12 +5184,12 @@
 ; SKX-NEXT:    vptestnmb %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korq %k2, %k1, %k1
 ; SKX-NEXT:    ktestq %k1, %k0
-; SKX-NEXT:    je LBB76_1
+; SKX-NEXT:    je LBB77_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB76_1: ## %bar
+; SKX-NEXT:  LBB77_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -5165,12 +5206,12 @@
 ; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm3, %k2
 ; AVX512BW-NEXT:    korq %k2, %k1, %k1
 ; AVX512BW-NEXT:    ktestq %k1, %k0
-; AVX512BW-NEXT:    je LBB76_1
+; AVX512BW-NEXT:    je LBB77_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB76_1: ## %bar
+; AVX512BW-NEXT:  LBB77_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -5231,12 +5272,12 @@
 ; AVX512DQ-NEXT:    orl %eax, %edx
 ; AVX512DQ-NEXT:    shlq $32, %rdx
 ; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    je LBB76_1
+; AVX512DQ-NEXT:    je LBB77_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB76_1: ## %bar
+; AVX512DQ-NEXT:  LBB77_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -5255,12 +5296,12 @@
 ; X86-NEXT:    kandq %k1, %k0, %k0
 ; X86-NEXT:    kshiftrq $32, %k0, %k1
 ; X86-NEXT:    kortestd %k1, %k0
-; X86-NEXT:    je LBB76_1
+; X86-NEXT:    je LBB77_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB76_1: ## %bar
+; X86-NEXT:  LBB77_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -5360,3 +5401,20 @@
   %maskv = insertelement <64 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
   ret <64 x i1> %maskv
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
--- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -130,6 +130,24 @@
   ret i64 %div
 }
 
+define i64 @div64_pgso(i64 %a, i64 %b) !prof !15 {
+; CHECK-LABEL: div64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
+; CHECK-NEXT:    retq
+;
+; HUGEWS-LABEL: div64_pgso:
+; HUGEWS:       # %bb.0:
+; HUGEWS-NEXT:    movq %rdi, %rax
+; HUGEWS-NEXT:    cqto
+; HUGEWS-NEXT:    idivq %rsi
+; HUGEWS-NEXT:    retq
+  %div = sdiv i64 %a, %b
+  ret i64 %div
+}
+
 define i64 @div64_hugews(i64 %a, i64 %b) {
 ; ATOM-LABEL: div64_hugews:
 ; ATOM:       # %bb.0:
@@ -137,12 +155,12 @@
 ; ATOM-NEXT:    movq %rdi, %rax
 ; ATOM-NEXT:    orq %rsi, %rcx
 ; ATOM-NEXT:    shrq $32, %rcx
-; ATOM-NEXT:    je .LBB3_1
+; ATOM-NEXT:    je .LBB4_1
 ; ATOM-NEXT:  # %bb.2:
 ; ATOM-NEXT:    cqto
 ; ATOM-NEXT:    idivq %rsi
 ; ATOM-NEXT:    retq
-; ATOM-NEXT:  .LBB3_1:
+; ATOM-NEXT:  .LBB4_1:
 ; ATOM-NEXT:    # kill: def $eax killed $eax killed $rax
 ; ATOM-NEXT:    xorl %edx, %edx
 ; ATOM-NEXT:    divl %esi
@@ -155,12 +173,12 @@
 ; SLM-NEXT:    movq %rdi, %rax
 ; SLM-NEXT:    orq %rsi, %rcx
 ; SLM-NEXT:    shrq $32, %rcx
-; SLM-NEXT:    je .LBB3_1
+; SLM-NEXT:    je .LBB4_1
 ; SLM-NEXT:  # %bb.2:
 ; SLM-NEXT:    cqto
 ; SLM-NEXT:    idivq %rsi
 ; SLM-NEXT:    retq
-; SLM-NEXT:  .LBB3_1:
+; SLM-NEXT:  .LBB4_1:
 ; SLM-NEXT:    xorl %edx, %edx
 ; SLM-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SLM-NEXT:    divl %esi
@@ -173,12 +191,12 @@
 ; SKL-NEXT:    movq %rdi, %rcx
 ; SKL-NEXT:    orq %rsi, %rcx
 ; SKL-NEXT:    shrq $32, %rcx
-; SKL-NEXT:    je .LBB3_1
+; SKL-NEXT:    je .LBB4_1
 ; SKL-NEXT:  # %bb.2:
 ; SKL-NEXT:    cqto
 ; SKL-NEXT:    idivq %rsi
 ; SKL-NEXT:    retq
-; SKL-NEXT:  .LBB3_1:
+; SKL-NEXT:  .LBB4_1:
 ; SKL-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SKL-NEXT:    xorl %edx, %edx
 ; SKL-NEXT:    divl %esi
@@ -213,6 +231,24 @@
   ret i32 %div
 }
 
+define i32 @div32_pgso(i32 %a, i32 %b) !prof !15 {
+; CHECK-LABEL: div32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    cltd
+; CHECK-NEXT:    idivl %esi
+; CHECK-NEXT:    retq
+;
+; HUGEWS-LABEL: div32_pgso:
+; HUGEWS:       # %bb.0:
+; HUGEWS-NEXT:    movl %edi, %eax
+; HUGEWS-NEXT:    cltd
+; HUGEWS-NEXT:    idivl %esi
+; HUGEWS-NEXT:    retq
+  %div = sdiv i32 %a, %b
+  ret i32 %div
+}
+
 define i32 @div32_minsize(i32 %a, i32 %b) minsize {
 ; CHECK-LABEL: div32_minsize:
 ; CHECK:       # %bb.0:
@@ -246,3 +282,4 @@
 !12 = !{i32 10000, i64 1000, i32 1}
 !13 = !{i32 999000, i64 1000, i32 3}
 !14 = !{i32 999999, i64 5, i32 3}
+!15 = !{!"function_entry_count", i64 0}
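
div64_pgso and div32_pgso pin down the size-optimized lowering of the slow-division bypass: on targets where wide division is slow, the compiler normally ORs the operands together, shifts right by 32, and branches to a 32-bit divl when the upper bits are zero (the div64_hugews checks above show that guarded form). Under optsize/PGSO the guard costs more code than it saves, so these tests expect the straight-line sequence:

; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: cqto
; CHECK-NEXT: idivq %rsi       ; no bypass check, no divl fallback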
diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll
--- a/llvm/test/CodeGen/X86/cmov-into-branch.ll
+++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll
@@ -88,7 +88,7 @@
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
   ret i32 %sel
 }
 
@@ -104,7 +104,7 @@
 ; CHECK-NEXT:  .LBB6_2: # %select.end
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
   ret i32 %sel
 }
 
@@ -124,7 +124,7 @@
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
   ret i32 %sel
 }
 
@@ -137,12 +137,51 @@
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !18
   ret i32 %sel
 }
 
-!0 = !{!"branch_weights", i32 1, i32 99}
-!1 = !{!"branch_weights", i32 1, i32 100}
-!2 = !{!"branch_weights", i32 100, i32 1}
-!3 = !{!"branch_weights", i32 0, i32 0}
+define i32 @weighted_select_optsize(i32 %a, i32 %b) optsize {
+; CHECK-LABEL: weighted_select_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
+
+define i32 @weighted_select_pgso(i32 %a, i32 %b) !prof !14 {
+; CHECK-LABEL: weighted_select_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 99}
+!16 = !{!"branch_weights", i32 1, i32 100}
+!17 = !{!"branch_weights", i32 100, i32 1}
+!18 = !{!"branch_weights", i32 0, i32 0}
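
The two new functions close out the weighted-select tests above: a select whose branch weights make one side overwhelmingly likely (!16, 1:100) is normally expanded into a compare-and-branch, but that expansion is larger than a conditional move, so under optsize and PGSO the cmov is kept. The pattern both tests check:

  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16  ; heavily biased select
; CHECK: cmovnel %edi, %eax                         ; still a cmov, not a branch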
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux   -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
+; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64
+
+declare void @foo()
+declare void @bar()
+
+define void @f(i32 %x, i32 %y) !prof !14 {
+; CHECK32-LABEL: f:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x08]
+; CHECK32-NEXT:    jne bar # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x75,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %bb1
+; CHECK32-NEXT:    jmp foo # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: f:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
+; CHECK64-NEXT:    jne bar # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %bb1
+; CHECK64-NEXT:    jmp foo # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: f:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; WIN64-NEXT:    jne bar # TAILCALL
+; WIN64-NEXT:    # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %bb1
+; WIN64-NEXT:    jmp foo # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+entry:
+  %p = icmp eq i32 %x, %y
+  br i1 %p, label %bb1, label %bb2
+bb1:
+  tail call void @foo()
+  ret void
+bb2:
+  tail call void @bar()
+  ret void
+
+; Check that the asm doesn't just look good, but uses the correct encoding.
+}
+
+define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
+; CHECK32-LABEL: f_non_leaf:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebx # encoding: [0x53]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c]
+; CHECK32-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %bb1
+; CHECK32-NEXT:    popl %ebx # encoding: [0x5b]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    jmp foo # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  .LBB1_2: # %bb2
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx # encoding: [0x5b]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    jmp bar # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: f_non_leaf:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbx # encoding: [0x53]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    .cfi_offset %rbx, -16
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
+; CHECK64-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %bb1
+; CHECK64-NEXT:    popq %rbx # encoding: [0x5b]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    jmp foo # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB1_2: # %bb2
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbx # encoding: [0x5b]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    jmp bar # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: f_non_leaf:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    pushq %rbx # encoding: [0x53]
+; WIN64-NEXT:    .seh_pushreg %rbx
+; WIN64-NEXT:    .seh_endprologue
+; WIN64-NEXT:    #APP
+; WIN64-NEXT:    #NO_APP
+; WIN64-NEXT:    cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; WIN64-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %bb1
+; WIN64-NEXT:    popq %rbx # encoding: [0x5b]
+; WIN64-NEXT:    jmp foo # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB1_2: # %bb2
+; WIN64-NEXT:    nop # encoding: [0x90]
+; WIN64-NEXT:    popq %rbx # encoding: [0x5b]
+; WIN64-NEXT:    jmp bar # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; WIN64-NEXT:    .seh_handlerdata
+; WIN64-NEXT:    .text
+; WIN64-NEXT:    .seh_endproc
+entry:
+  ; Force %ebx to be spilled to the stack, so that this is no longer a
+  ; "leaf" function for Win64.
+  tail call void asm sideeffect "", "~{ebx}"()
+
+  %p = icmp eq i32 %x, %y
+  br i1 %p, label %bb1, label %bb2
+bb1:
+  tail call void @foo()
+  ret void
+bb2:
+  tail call void @bar()
+  ret void
+
+}
+
+declare x86_thiscallcc zeroext i1 @baz(i8*, i32)
+define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) !prof !14 {
+; CHECK32-LABEL: BlockPlacementTest:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; CHECK32-NEXT:    testb $42, %dl # encoding: [0xf6,0xc2,0x2a]
+; CHECK32-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %land.rhs
+; CHECK32-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; CHECK32-NEXT:    testb $44, %dl # encoding: [0xf6,0xc2,0x2c]
+; CHECK32-NEXT:    je baz # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  .LBB2_2: # %land.end
+; CHECK32-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK32-NEXT:    retl $4 # encoding: [0xc2,0x04,0x00]
+; CHECK32-NEXT:  .LBB2_3:
+; CHECK32-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK32-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: BlockPlacementTest:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    testb $42, %sil # encoding: [0x40,0xf6,0xc6,0x2a]
+; CHECK64-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %land.rhs
+; CHECK64-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; CHECK64-NEXT:    testb $44, %sil # encoding: [0x40,0xf6,0xc6,0x2c]
+; CHECK64-NEXT:    je baz # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB2_2: # %land.end
+; CHECK64-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK64-NEXT:    retq # encoding: [0xc3]
+; CHECK64-NEXT:  .LBB2_3:
+; CHECK64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK64-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: BlockPlacementTest:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    testb $42, %dl # encoding: [0xf6,0xc2,0x2a]
+; WIN64-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %land.rhs
+; WIN64-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; WIN64-NEXT:    testb $44, %dl # encoding: [0xf6,0xc2,0x2c]
+; WIN64-NEXT:    je baz # TAILCALL
+; WIN64-NEXT:    # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB2_2: # %land.end
+; WIN64-NEXT:    # kill: def $al killed $al killed $eax
+; WIN64-NEXT:    retq # encoding: [0xc3]
+; WIN64-NEXT:  .LBB2_3:
+; WIN64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; WIN64-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+entry:
+  %and = and i32 %x, 42
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %land.end, label %land.rhs
+
+land.rhs:
+  %and6 = and i32 %x, 44
+  %tobool7 = icmp eq i32 %and6, 0
+  br i1 %tobool7, label %lor.rhs, label %land.end
+
+lor.rhs:
+  %call = tail call x86_thiscallcc zeroext i1 @baz(i8* %this, i32 %x) #2
+  br label %land.end
+
+land.end:
+  %0 = phi i1 [ false, %entry ], [ true, %land.rhs ], [ %call, %lor.rhs ]
+  ret i1 %0
+
+; Make sure machine block placement isn't confused by the conditional tail call,
+; but sees that it can fall through to the next block.
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll
--- a/llvm/test/CodeGen/X86/fixup-lea.ll
+++ b/llvm/test/CodeGen/X86/fixup-lea.ll
@@ -108,17 +108,96 @@
   ret void
 }
 
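+; PGSO variants: with a cold entry count the -1/+1 loop counter should be
+; materialized with xor + dec/inc instead of a movw immediate (compare
+; foo_nosize/bar_nosize below).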
+define void @foo_pgso(i32 inreg %dns) !prof !14 {
+; SLOW-LABEL: foo_pgso:
+; SLOW:       # %bb.0: # %entry
+; SLOW-NEXT:    xorl %ecx, %ecx
+; SLOW-NEXT:    decl %ecx
+; SLOW-NEXT:  .LBB4_1: # %for.body
+; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
+; SLOW-NEXT:    movzwl %cx, %edx
+; SLOW-NEXT:    decl %ecx
+; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    jl .LBB4_1
+; SLOW-NEXT:  # %bb.2: # %for.end
+; SLOW-NEXT:    retl
+;
+; FAST-LABEL: foo_pgso:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xorl %ecx, %ecx
+; FAST-NEXT:    decl %ecx
+; FAST-NEXT:  .LBB4_1: # %for.body
+; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-NEXT:    movzwl %cx, %edx
+; FAST-NEXT:    addl $-1, %ecx
+; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    jl .LBB4_1
+; FAST-NEXT:  # %bb.2: # %for.end
+; FAST-NEXT:    retl
+entry:
+  br label %for.body
+
+for.body:
+  %i.05 = phi i16 [ %dec, %for.body ], [ 0, %entry ]
+  %dec = add i16 %i.05, -1
+  %conv = zext i16 %dec to i32
+  %cmp = icmp slt i32 %conv, %dns
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+define void @bar_pgso(i32 inreg %dns) !prof !14 {
+; SLOW-LABEL: bar_pgso:
+; SLOW:       # %bb.0: # %entry
+; SLOW-NEXT:    xorl %ecx, %ecx
+; SLOW-NEXT:    incl %ecx
+; SLOW-NEXT:  .LBB5_1: # %for.body
+; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
+; SLOW-NEXT:    movzwl %cx, %edx
+; SLOW-NEXT:    incl %ecx
+; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    jl .LBB5_1
+; SLOW-NEXT:  # %bb.2: # %for.end
+; SLOW-NEXT:    retl
+;
+; FAST-LABEL: bar_pgso:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xorl %ecx, %ecx
+; FAST-NEXT:    incl %ecx
+; FAST-NEXT:  .LBB5_1: # %for.body
+; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-NEXT:    movzwl %cx, %edx
+; FAST-NEXT:    addl $1, %ecx
+; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    jl .LBB5_1
+; FAST-NEXT:  # %bb.2: # %for.end
+; FAST-NEXT:    retl
+entry:
+  br label %for.body
+
+for.body:
+  %i.05 = phi i16 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i16 %i.05, 1
+  %conv = zext i16 %inc to i32
+  %cmp = icmp slt i32 %conv, %dns
+  br i1 %cmp, label %for.body, label %for.end
+for.end:
+  ret void
+}
+
 define void @foo_nosize(i32 inreg %dns) {
 ; SLOW-LABEL: foo_nosize:
 ; SLOW:       # %bb.0: # %entry
 ; SLOW-NEXT:    movw $-1, %cx
 ; SLOW-NEXT:    .p2align 4, 0x90
-; SLOW-NEXT:  .LBB4_1: # %for.body
+; SLOW-NEXT:  .LBB6_1: # %for.body
 ; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    decl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
-; SLOW-NEXT:    jl .LBB4_1
+; SLOW-NEXT:    jl .LBB6_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
 ;
@@ -126,12 +205,12 @@
 ; FAST:       # %bb.0: # %entry
 ; FAST-NEXT:    movw $-1, %cx
 ; FAST-NEXT:    .p2align 4, 0x90
-; FAST-NEXT:  .LBB4_1: # %for.body
+; FAST-NEXT:  .LBB6_1: # %for.body
 ; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $-1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
-; FAST-NEXT:    jl .LBB4_1
+; FAST-NEXT:    jl .LBB6_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
 entry:
@@ -153,12 +232,12 @@
 ; SLOW:       # %bb.0: # %entry
 ; SLOW-NEXT:    movw $1, %cx
 ; SLOW-NEXT:    .p2align 4, 0x90
-; SLOW-NEXT:  .LBB5_1: # %for.body
+; SLOW-NEXT:  .LBB7_1: # %for.body
 ; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    incl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
-; SLOW-NEXT:    jl .LBB5_1
+; SLOW-NEXT:    jl .LBB7_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
 ;
@@ -166,12 +245,12 @@
 ; FAST:       # %bb.0: # %entry
 ; FAST-NEXT:    movw $1, %cx
 ; FAST-NEXT:    .p2align 4, 0x90
-; FAST-NEXT:  .LBB5_1: # %for.body
+; FAST-NEXT:  .LBB7_1: # %for.body
 ; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
-; FAST-NEXT:    jl .LBB5_1
+; FAST-NEXT:    jl .LBB7_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
 entry:
@@ -186,3 +265,20 @@
 for.end:
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll
--- a/llvm/test/CodeGen/X86/fold-load-unops.ll
+++ b/llvm/test/CodeGen/X86/fold-load-unops.ll
@@ -113,6 +113,38 @@
     ret <4 x float> %res
 }
 
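+; The *_pgso tests mirror the *_size tests: folding the scalar load into
+; rcpss/rsqrtss/sqrtss is only done when optimizing for size, which PGSO
+; infers here from the cold entry count given by !prof !14.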
+define float @rcpss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: rcpss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rcpss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @rcpss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: rcpss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rcpss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define float @rsqrtss_size(float* %a) optsize {
 ; SSE-LABEL: rsqrtss_size:
 ; SSE:       # %bb.0:
@@ -145,6 +177,38 @@
     ret <4 x float> %res
 }
 
+define float @rsqrtss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: rsqrtss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rsqrtss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: rsqrtss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rsqrtss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define float @sqrtss_size(float* %a) optsize {
 ; SSE-LABEL: sqrtss_size:
 ; SSE:       # %bb.0:
@@ -196,6 +260,57 @@
     ret <4 x float> %res
 }
 
+define float @sqrtss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: sqrtss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    sqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: sqrtss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
+define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: sqrtss_full_pgso_volatile:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_full_pgso_volatile:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load volatile <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define double @sqrtsd_size(double* %a) optsize {
 ; SSE-LABEL: sqrtsd_size:
 ; SSE:       # %bb.0:
@@ -247,7 +362,75 @@
     ret <2 x double> %res
 }
 
+define double @sqrtsd_pgso(double* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    sqrtsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load double, double* %a
+    %ins = insertelement <2 x double> undef, double %ld, i32 0
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
+    %ext = extractelement <2 x double> %res, i32 0
+    ret double %ext
+}
+
+define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovapd (%rdi), %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <2 x double>, <2 x double>* %a
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
+    ret <2 x double> %res
+}
+
+define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_full_pgso_volatile:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_full_pgso_volatile:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovapd (%rdi), %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load volatile <2 x double>, <2 x double>* %a
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
+    ret <2 x double> %res
+}
+
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -196,6 +196,26 @@
   ret i32 %tmp
 }
 
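+; With the function marked cold via !prof !14, both 32-bit RUN configurations
+; are expected to share the compact shldl lowering, hence the single X86
+; prefix below.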
+define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 {
+; X86-LABEL: var_shift_i32_pgso:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
 define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i64:
 ; X86-FAST:       # %bb.0:
@@ -216,36 +236,36 @@
 ; X86-FAST-NEXT:    shll %cl, %edi
 ; X86-FAST-NEXT:    shldl %cl, %eax, %ebp
 ; X86-FAST-NEXT:    testb $32, %bl
-; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:    je .LBB5_2
 ; X86-FAST-NEXT:  # %bb.1:
 ; X86-FAST-NEXT:    movl %edi, %ebp
 ; X86-FAST-NEXT:    xorl %edi, %edi
-; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:  .LBB5_2:
 ; X86-FAST-NEXT:    movb $64, %cl
 ; X86-FAST-NEXT:    subb %bl, %cl
 ; X86-FAST-NEXT:    movl %edx, %esi
 ; X86-FAST-NEXT:    shrl %cl, %esi
 ; X86-FAST-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    jne .LBB4_3
+; X86-FAST-NEXT:    jne .LBB5_3
 ; X86-FAST-NEXT:  # %bb.4:
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    jne .LBB4_6
-; X86-FAST-NEXT:    jmp .LBB4_7
-; X86-FAST-NEXT:  .LBB4_3:
+; X86-FAST-NEXT:    jne .LBB5_6
+; X86-FAST-NEXT:    jmp .LBB5_7
+; X86-FAST-NEXT:  .LBB5_3:
 ; X86-FAST-NEXT:    movl %esi, %ecx
 ; X86-FAST-NEXT:    xorl %esi, %esi
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    je .LBB4_7
-; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:    je .LBB5_7
+; X86-FAST-NEXT:  .LBB5_6:
 ; X86-FAST-NEXT:    orl %esi, %ebp
 ; X86-FAST-NEXT:    orl %ecx, %edi
 ; X86-FAST-NEXT:    movl %edi, %eax
 ; X86-FAST-NEXT:    movl %ebp, %edx
-; X86-FAST-NEXT:  .LBB4_7:
+; X86-FAST-NEXT:  .LBB5_7:
 ; X86-FAST-NEXT:    addl $4, %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
@@ -279,11 +299,11 @@
 ; X86-SLOW-NEXT:    testb %dl, %dl
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:    je .LBB5_2
 ; X86-SLOW-NEXT:  # %bb.1:
 ; X86-SLOW-NEXT:    orl %eax, %ebp
 ; X86-SLOW-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:  .LBB5_2:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-SLOW-NEXT:    movl %ebp, %eax
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
@@ -294,41 +314,41 @@
 ; X86-SLOW-NEXT:    negb %cl
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb %ch, %ch
-; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:    je .LBB5_4
 ; X86-SLOW-NEXT:  # %bb.3:
 ; X86-SLOW-NEXT:    orl %edi, %eax
 ; X86-SLOW-NEXT:    movl %eax, %ebp
-; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:  .LBB5_4:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl %eax, %edi
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %edi
 ; X86-SLOW-NEXT:    testb $32, %bl
-; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:    je .LBB5_6
 ; X86-SLOW-NEXT:  # %bb.5:
 ; X86-SLOW-NEXT:    movl %edi, %ebp
 ; X86-SLOW-NEXT:    xorl %edi, %edi
-; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:  .LBB5_6:
 ; X86-SLOW-NEXT:    movb %dh, %cl
 ; X86-SLOW-NEXT:    shrl %cl, %esi
 ; X86-SLOW-NEXT:    testb $32, %dh
-; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:    jne .LBB5_7
 ; X86-SLOW-NEXT:  # %bb.8:
 ; X86-SLOW-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    jne .LBB4_10
-; X86-SLOW-NEXT:    jmp .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    jne .LBB5_10
+; X86-SLOW-NEXT:    jmp .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_7:
 ; X86-SLOW-NEXT:    movl %esi, %ecx
 ; X86-SLOW-NEXT:    xorl %esi, %esi
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    je .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    je .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_10:
 ; X86-SLOW-NEXT:    orl %esi, %ebp
 ; X86-SLOW-NEXT:    orl %ecx, %edi
 ; X86-SLOW-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT:    movl %edi, %eax
-; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:  .LBB5_11:
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SLOW-NEXT:    addl $8, %esp
 ; X86-SLOW-NEXT:    popl %esi
@@ -503,3 +523,20 @@
   %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7)
   ret i64 %tmp
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -195,6 +195,26 @@
   ret i32 %tmp
 }
 
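+; As in fshl.ll, the cold entry count should let both 32-bit configurations
+; share the compact shrdl lowering.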
+define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 {
+; X86-LABEL: var_shift_i32_pgso:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %edi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
 define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i64:
 ; X86-FAST:       # %bb.0:
@@ -216,30 +236,30 @@
 ; X86-FAST-NEXT:    shll %cl, %edi
 ; X86-FAST-NEXT:    shldl %cl, %eax, %esi
 ; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:    je .LBB5_2
 ; X86-FAST-NEXT:  # %bb.1:
 ; X86-FAST-NEXT:    movl %edi, %esi
 ; X86-FAST-NEXT:    xorl %edi, %edi
-; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:  .LBB5_2:
 ; X86-FAST-NEXT:    movl %edx, %ebp
 ; X86-FAST-NEXT:    movl %ebx, %ecx
 ; X86-FAST-NEXT:    shrl %cl, %ebp
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    shrdl %cl, %edx, %eax
 ; X86-FAST-NEXT:    testb $32, %bl
-; X86-FAST-NEXT:    je .LBB4_4
+; X86-FAST-NEXT:    je .LBB5_4
 ; X86-FAST-NEXT:  # %bb.3:
 ; X86-FAST-NEXT:    movl %ebp, %eax
 ; X86-FAST-NEXT:    xorl %ebp, %ebp
-; X86-FAST-NEXT:  .LBB4_4:
+; X86-FAST-NEXT:  .LBB5_4:
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    je .LBB4_6
+; X86-FAST-NEXT:    je .LBB5_6
 ; X86-FAST-NEXT:  # %bb.5:
 ; X86-FAST-NEXT:    orl %ebp, %esi
 ; X86-FAST-NEXT:    orl %eax, %edi
 ; X86-FAST-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-FAST-NEXT:    movl %esi, %edx
-; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:  .LBB5_6:
 ; X86-FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-FAST-NEXT:    addl $4, %esp
 ; X86-FAST-NEXT:    popl %esi
@@ -274,11 +294,11 @@
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb %ch, %ch
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:    je .LBB5_2
 ; X86-SLOW-NEXT:  # %bb.1:
 ; X86-SLOW-NEXT:    orl %edi, %edx
 ; X86-SLOW-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:  .LBB5_2:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl %ecx, %edx
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
@@ -291,41 +311,41 @@
 ; X86-SLOW-NEXT:    shll %cl, %edi
 ; X86-SLOW-NEXT:    testb %ah, %ah
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:    je .LBB5_4
 ; X86-SLOW-NEXT:  # %bb.3:
 ; X86-SLOW-NEXT:    orl %edx, %edi
 ; X86-SLOW-NEXT:    movl %edi, %ebp
-; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:  .LBB5_4:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb $32, %bl
-; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:    je .LBB5_6
 ; X86-SLOW-NEXT:  # %bb.5:
 ; X86-SLOW-NEXT:    movl %edi, %ebp
 ; X86-SLOW-NEXT:    xorl %edi, %edi
-; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:  .LBB5_6:
 ; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    testb $32, %al
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:    jne .LBB5_7
 ; X86-SLOW-NEXT:  # %bb.8:
 ; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    jne .LBB4_10
-; X86-SLOW-NEXT:    jmp .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    jne .LBB5_10
+; X86-SLOW-NEXT:    jmp .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_7:
 ; X86-SLOW-NEXT:    movl %esi, %eax
 ; X86-SLOW-NEXT:    xorl %esi, %esi
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    je .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    je .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_10:
 ; X86-SLOW-NEXT:    orl %ebp, %esi
 ; X86-SLOW-NEXT:    orl %edi, %eax
 ; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT:    movl %eax, %edx
-; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:  .LBB5_11:
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    addl $8, %esp
 ; X86-SLOW-NEXT:    popl %esi
@@ -499,3 +519,20 @@
   %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7)
   ret i64 %tmp
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1983,6 +1983,80 @@
   ret float %x230
 }
 
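+; PGSO versions of the hadd32 reductions: the cold entry count from !prof !14
+; should make both SSE3 configurations pick the shorter haddps finish, so one
+; set of checks suffices.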
+define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_4_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_4_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %x227 = fadd <4 x float> %x225, %x226
+  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <4 x float> %x227, %x228
+  %x230 = extractelement <4 x float> %x229, i32 0
+  ret float %x230
+}
+
+define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_8_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x227 = fadd <8 x float> %x225, %x226
+  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <8 x float> %x227, %x228
+  %x230 = extractelement <8 x float> %x229, i32 0
+  ret float %x230
+}
+
+define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_16_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x227 = fadd <16 x float> %x225, %x226
+  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <16 x float> %x227, %x228
+  %x230 = extractelement <16 x float> %x229, i32 0
+  ret float %x230
+}
+
 define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
 ; SSE3-SLOW:       # %bb.0:
@@ -2115,3 +2189,20 @@
   %r = extractelement <16 x float> %x0123, i32 0
   ret float %r
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll
--- a/llvm/test/CodeGen/X86/immediate_merging.ll
+++ b/llvm/test/CodeGen/X86/immediate_merging.ll
@@ -73,6 +73,68 @@
   ret i32 0
 }
 
+; Test PGSO to make sure immediates with multiple users don't get pulled into
+; instructions.
+define i32 @foo_pgso() !prof !14 {
+; X86-LABEL: foo_pgso:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $1234, %eax # imm = 0x4D2
+; X86-NEXT:    movl %eax, a
+; X86-NEXT:    movl %eax, b
+; X86-NEXT:    movl $12, %eax
+; X86-NEXT:    movl %eax, c
+; X86-NEXT:    cmpl %eax, e
+; X86-NEXT:    jne .LBB1_2
+; X86-NEXT:  # %bb.1: # %if.then
+; X86-NEXT:    movl $1, x
+; X86-NEXT:  .LBB1_2: # %if.end
+; X86-NEXT:    movl $1234, f # imm = 0x4D2
+; X86-NEXT:    movl $555, %eax # imm = 0x22B
+; X86-NEXT:    movl %eax, h
+; X86-NEXT:    addl %eax, i
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: foo_pgso:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl $1234, %eax # imm = 0x4D2
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    movl $12, %eax
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    cmpl %eax, {{.*}}(%rip)
+; X64-NEXT:    jne .LBB1_2
+; X64-NEXT:  # %bb.1: # %if.then
+; X64-NEXT:    movl $1, {{.*}}(%rip)
+; X64-NEXT:  .LBB1_2: # %if.end
+; X64-NEXT:    movl $1234, {{.*}}(%rip) # imm = 0x4D2
+; X64-NEXT:    movl $555, %eax # imm = 0x22B
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    addl %eax, {{.*}}(%rip)
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+  store i32 12, i32* @c
+  %0 = load i32, i32* @e
+  %cmp = icmp eq i32 %0, 12
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @x
+  br label %if.end
+
+; New block. Make sure 1234 isn't live across basic blocks from before.
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1234, i32* @f
+  store i32 555, i32* @h
+  %1 = load i32, i32* @i
+  %add1 = add nsw i32 %1, 555
+  store i32 %add1, i32* @i
+  ret i32 0
+}
+
 ; Test -O2 to make sure that all immediates get pulled into their users.
 define i32 @foo2() {
 ; X86-LABEL: foo2:
@@ -124,3 +186,47 @@
   call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false)
   ret void
 }
+
+; memset gets lowered in the DAG. Constant merging should hoist all the
+; immediates used to store to the individual memory locations. Make
+; sure we don't store the immediates directly.
+define void @foomemset_pgso() !prof !14 {
+; X86-LABEL: foomemset_pgso:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $555819297, %eax # imm = 0x21212121
+; X86-NEXT:    movl %eax, AA+20
+; X86-NEXT:    movl %eax, AA+16
+; X86-NEXT:    movl %eax, AA+12
+; X86-NEXT:    movl %eax, AA+8
+; X86-NEXT:    movl %eax, AA+4
+; X86-NEXT:    movl %eax, AA
+; X86-NEXT:    retl
+;
+; X64-LABEL: foomemset_pgso:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movabsq $2387225703656530209, %rax # imm = 0x2121212121212121
+; X64-NEXT:    movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT:    movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT:    movq %rax, {{.*}}(%rip)
+; X64-NEXT:    retq
+entry:
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/immediate_merging64.ll b/llvm/test/CodeGen/X86/immediate_merging64.ll
--- a/llvm/test/CodeGen/X86/immediate_merging64.ll
+++ b/llvm/test/CodeGen/X86/immediate_merging64.ll
@@ -19,6 +19,19 @@
   ret i1 %cmp
 }
 
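+; PGSO counterpart of the test above: the shared -1 immediate should be kept
+; in a register rather than duplicated into both user instructions.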
+define i1 @imm_multiple_users_pgso(i64 %a, i64* %b) !prof !14 {
+; CHECK-LABEL: imm_multiple_users_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    movq %rax, (%rsi)
+; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  store i64 -1, i64* %b, align 8
+  %cmp = icmp eq i64 %a, -1
+  ret i1 %cmp
+}
+
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
 
 ; Inlined memsets requiring multiple same-sized stores should be lowered using
@@ -34,3 +47,31 @@
   tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false)
   ret void
 }
+
+define void @memset_zero_pgso(i8* noalias nocapture %D) !prof !14 {
+; CHECK-LABEL: memset_zero_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movq %rax, 7(%rdi)
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/loop-blocks.ll b/llvm/test/CodeGen/X86/loop-blocks.ll
--- a/llvm/test/CodeGen/X86/loop-blocks.ll
+++ b/llvm/test/CodeGen/X86/loop-blocks.ll
@@ -269,6 +269,35 @@
 
 attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
 
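+; PGSO version of the test above: the loop should be laid out so control falls
+; through into .LBB6_1 instead of jumping to it.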
+; CHECK-LABEL: slightly_more_involved_2_pgso:
+; CHECK-NOT:      jmp .LBB6_1
+; CHECK:          .LBB6_1:
+; CHECK-NEXT:     callq body
+
+define void @slightly_more_involved_2_pgso() norecurse nounwind readnone uwtable !prof !14 {
+entry:
+  br label %loop
+
+loop:
+  call void @body()
+  %t0 = call i32 @get()
+  %t1 = icmp slt i32 %t0, 2
+  br i1 %t1, label %block_a, label %bb
+
+bb:
+  %t2 = call i32 @get()
+  %t3 = icmp slt i32 %t2, 99
+  br i1 %t3, label %exit, label %loop
+
+block_a:
+  call void @bar99()
+  br label %loop
+
+exit:
+  call void @exit()
+  ret void
+}
+
 declare void @bar99() nounwind
 declare void @bar100() nounwind
 declare void @bar101() nounwind
@@ -281,3 +310,20 @@
 declare void @block_a_true_func() nounwind
 declare void @block_a_false_func() nounwind
 declare void @block_a_merge_func() nounwind
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll
--- a/llvm/test/CodeGen/X86/materialize.ll
+++ b/llvm/test/CodeGen/X86/materialize.ll
@@ -30,6 +30,21 @@
 ; CHECK64-NEXT:  retq
 }
 
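+; The *_pgso variants should materialize small constants the same way the
+; minsize tests do: xor to zero the register, then inc/dec.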
+define i32 @one32_pgso() !prof !14 {
+entry:
+  ret i32 1
+
+; CHECK32-LABEL: one32_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  retl
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32_pgso:
+; CHECK64:       movl $1, %eax
+; CHECK64-NEXT:  retq
+}
+
 define i32 @one32_minsize() minsize {
 entry:
   ret i32 1
@@ -107,6 +122,16 @@
 ; CHECK32-NEXT:  retl
 }
 
+define i32 @minus_one32_pgso() !prof !14 {
+entry:
+  ret i32 -1
+
+; CHECK32-LABEL: minus_one32_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  retl
+}
+
 define i32 @minus_one32_minsize() minsize {
 entry:
   ret i32 -1
@@ -140,6 +165,28 @@
 ; CHECK32-NEXT:  retl
 }
 
+define i16 @one16_pgso() !prof !14 {
+entry:
+  ret i16 1
+
+; CHECK32-LABEL: one16_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  # kill
+; CHECK32-NEXT:  retl
+}
+
+define i16 @minus_one16_pgso() !prof !14 {
+entry:
+  ret i16 -1
+
+; CHECK32-LABEL: minus_one16_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  # kill
+; CHECK32-NEXT:  retl
+}
+
 define i32 @minus_five32() minsize {
 entry:
   ret i32 -5
@@ -213,4 +260,72 @@
 ; CHECK32:       retl
 }
 
+define i32 @rematerialize_minus_one_pgso() !prof !14 {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; -1 should be re-materialized here instead of getting spilled above.
+  ret i32 -1
+
+; CHECK32-LABEL: rematerialize_minus_one_pgso
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NOT:   %eax
+; CHECK32:       retl
+}
+
+define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; Define eflags.
+  %a = icmp ne i32 %x, 123
+  %b = zext i1 %a to i32
+  ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+  ; It must therefore not use the xor-dec lowering.
+  %c = select i1 %a, i32 %b, i32 -1
+  ret i32 %c
+
+; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       cmpl
+; CHECK32:       setne
+; CHECK32-NOT:   xorl
+; CHECK32:       movl $-1
+; CHECK32:       cmov
+; CHECK32:       retl
+}
+
 declare x86_thiscallcc void @f(i32)
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -0,0 +1,1029 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
+
+; This tests codegen-time inlining/optimization of memcmp.
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare i32 @memcmp(i8*, i8*, i64)
+declare i32 @bcmp(i8*, i8*, i64)
+
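+; Every function here carries !prof !14 (entry count zero, cold under the
+; ProfileSummary at the end of the file), so the memcmp expansion should take
+; the same size-optimized path as it does for optsize functions.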
+define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    rolw $8, %cx
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpw (%rsi), %ax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length2_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2_eq_nobuiltin_attr:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $2
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq_nobuiltin_attr:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length3:
+; X86:       # %bb.0: # %loadbb
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    rolw $8, %si
+; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2: # %loadbb1
+; X86-NEXT:    movzbl 2(%eax), %eax
+; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1: # %res_block
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB4_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length3:
+; X64:       # %bb.0: # %loadbb
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    cmpw %cx, %ax
+; X64-NEXT:    jne .LBB4_1
+; X64-NEXT:  # %bb.2: # %loadbb1
+; X64-NEXT:    movzbl 2(%rdi), %eax
+; X64-NEXT:    movzbl 2(%rsi), %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_1: # %res_block
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length3_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %edx
+; X86-NEXT:    xorw (%eax), %dx
+; X86-NEXT:    movb 2(%ecx), %cl
+; X86-NEXT:    xorb 2(%eax), %cl
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length3_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    xorw (%rsi), %ax
+; X64-NEXT:    movb 2(%rdi), %cl
+; X64-NEXT:    xorb 2(%rsi), %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    orw %ax, %cx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length4:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length4_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    cmpl (%rsi), %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length4_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length5:
+; X86:       # %bb.0: # %loadbb
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    jne .LBB9_1
+; X86-NEXT:  # %bb.2: # %loadbb1
+; X86-NEXT:    movzbl 4(%eax), %eax
+; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1: # %res_block
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB9_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length5:
+; X64:       # %bb.0: # %loadbb
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    jne .LBB9_1
+; X64-NEXT:  # %bb.2: # %loadbb1
+; X64-NEXT:    movzbl 4(%rdi), %eax
+; X64-NEXT:    movzbl 4(%rsi), %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB9_1: # %res_block
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length5_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    xorl (%eax), %edx
+; X86-NEXT:    movb 4(%ecx), %cl
+; X86-NEXT:    xorb 4(%eax), %cl
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length5_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    xorl (%rsi), %eax
+; X64-NEXT:    movb 4(%rdi), %cl
+; X64-NEXT:    xorb 4(%rsi), %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length8:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB11_2
+; X86-NEXT:  # %bb.1: # %loadbb1
+; X86-NEXT:    movl 4(%esi), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB11_3
+; X86-NEXT:  .LBB11_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    setae %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB11_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length8_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    xorl (%eax), %edx
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    cmpq (%rsi), %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length8_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %ecx
+; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
+; X64-NEXT:    cmpq %rax, (%rdi)
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length12_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length12_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movl 8(%rdi), %ecx
+; X64-NEXT:    xorl 8(%rsi), %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length12:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length12:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB15_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movl 8(%rdi), %ecx
+; X64-NEXT:    movl 8(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB15_3
+; X64-NEXT:  .LBB15_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB15_3: # %endblock
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $16
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length16:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB16_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 8(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB16_3
+; X64-NEXT:  .LBB16_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB16_3: # %endblock
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length16_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length16_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length24:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $24
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length24:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $24, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length24_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length24_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length32:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $32
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $32, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length32_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT:    cmpl $-1, %eax
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length32_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT:    cmpl $-1, %eax
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $64, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-LABEL: length64_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
+; X64-AVX2-NEXT:    vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT:    cmpl $-1, %eax
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length64_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl $.L.str
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT:    cmpl $-1, %eax
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: bcmp_length2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    rolw $8, %cx
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bcmp_length2:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind
+  ret i32 %m
+}
+
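+; The module flags below describe a synthetic instrumentation profile, and
+; !14 gives each function an entry count of 0. The functions above are thus
+; treated as cold and optimized for size, which is why the longer lengths
+; are left as plain memcmp/bcmp libcalls instead of being fully expanded.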
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -139,6 +139,38 @@
   ret void
 }
 
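+; PGSO variant of test3_minsize below: the zero entry count from !prof !14
+; keeps the memcpy libcall on Linux, while Darwin still expands it inline.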
+define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 {
+; LINUX-LABEL: test3_pgso:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movl $64, %edx
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
+; DARWIN-LABEL: test3_pgso:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movq 56(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 56(%rdi)
+; DARWIN-NEXT:    movq 48(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 48(%rdi)
+; DARWIN-NEXT:    movq 40(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 40(%rdi)
+; DARWIN-NEXT:    movq 32(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 32(%rdi)
+; DARWIN-NEXT:    movq 24(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 24(%rdi)
+; DARWIN-NEXT:    movq 16(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 16(%rdi)
+; DARWIN-NEXT:    movq (%rsi), %rax
+; DARWIN-NEXT:    movq 8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, 8(%rdi)
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
+  ret void
+}
+
 define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone {
 ; DARWIN-LABEL: test3_minsize:
 ; DARWIN:       ## %bb.0:
@@ -506,3 +536,20 @@
   tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false)
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll
--- a/llvm/test/CodeGen/X86/powi.ll
+++ b/llvm/test/CodeGen/X86/powi.ll
@@ -86,6 +86,41 @@
   ret double %ret
 }
 
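+; PGSO variant of pow_wrapper_minsize below: the powi call is lowered to a
+; __powidf2 libcall rather than being expanded into a chain of multiplies.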
+define double @pow_wrapper_pgso(double %a) !prof !14 {
+; X86-X87-LABEL: pow_wrapper_pgso:
+; X86-X87:       # %bb.0:
+; X86-X87-NEXT:    subl $12, %esp
+; X86-X87-NEXT:    .cfi_def_cfa_offset 16
+; X86-X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    fstpl (%esp)
+; X86-X87-NEXT:    movl $15, {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    calll __powidf2
+; X86-X87-NEXT:    addl $12, %esp
+; X86-X87-NEXT:    .cfi_def_cfa_offset 4
+; X86-X87-NEXT:    retl
+;
+; X86-SSE-LABEL: pow_wrapper_pgso:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE-NEXT:    movl $15, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    calll __powidf2
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE-NEXT:    retl
+;
+; X64-LABEL: pow_wrapper_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $15, %edi
+; X64-NEXT:    jmp __powidf2 # TAILCALL
+  %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind
+  ret double %ret
+}
+
 define double @pow_wrapper_minsize(double %a) minsize {
 ; X86-X87-LABEL: pow_wrapper_minsize:
 ; X86-X87:       # %bb.0:
@@ -124,3 +157,19 @@
 
 declare double @llvm.powi.f64(double, i32) nounwind readonly
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll
--- a/llvm/test/CodeGen/X86/rounding-ops.ll
+++ b/llvm/test/CodeGen/X86/rounding-ops.ll
@@ -252,3 +252,62 @@
   %call = tail call double @trunc(double %x) nounwind readnone
   ret double %call
 }
+
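+; PGSO variants of test11/test12: truncf/trunc fold the load and lower to a
+; single roundss/roundsd with immediate 11 (round toward zero).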
+define float @test11_pgso(float* %xptr) nounwind !prof !14 {
+; CHECK-SSE-LABEL: test11_pgso:
+; CHECK-SSE:       ## %bb.0:
+; CHECK-SSE-NEXT:    roundss $11, (%rdi), %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX-LABEL: test11_pgso:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: test11_pgso:
+; CHECK-AVX512:       ## %bb.0:
+; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    retq
+  %x = load float, float* %xptr
+  %call = tail call float @truncf(float %x) nounwind readnone
+  ret float %call
+}
+
+define double @test12_pgso(double* %xptr) nounwind !prof !14 {
+; CHECK-SSE-LABEL: test12_pgso:
+; CHECK-SSE:       ## %bb.0:
+; CHECK-SSE-NEXT:    roundsd $11, (%rdi), %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX-LABEL: test12_pgso:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: test12_pgso:
+; CHECK-AVX512:       ## %bb.0:
+; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    retq
+  %x = load double, double* %xptr
+  %call = tail call double @trunc(double %x) nounwind readnone
+  ret double %call
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
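+; PGSO variants of the shrink-compare.ll tests: the zero entry count from
+; !prof !14 marks the tagged functions cold and enables compare shrinking.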
+
+declare void @bar()
+
+define void @test1(i32* nocapture %X) nounwind !prof !14 {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, (%rdi)
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %tmp1 = load i32, i32* %X, align 4
+  %and = and i32 %tmp1, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test3(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 255
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; PR16083
+define i1 @test4(i64 %a, i32 %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %lor.end
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB3_1: # %lor.rhs
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+entry:
+  %tobool = icmp ne i32 %b, 0
+  br i1 %tobool, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %entry
+  %and = and i64 0, %a
+  %tobool1 = icmp ne i64 %and, 0
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %entry
+  %p = phi i1 [ true, %entry ], [ %tobool1, %lor.rhs ]
+  ret i1 %p
+}
+
+@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
+
+; PR16551
+define void @test5(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl x+{{.*}}(%rip), %eax
+; CHECK-NEXT:    shll $16, %eax
+; CHECK-NEXT:    movzwl x+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    cmpl $1, %ecx
+; CHECK-NEXT:    jne bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
+  %bf.lshr = lshr i56 %bf.load, 32
+  %bf.cast = trunc i56 %bf.lshr to i32
+  %cmp = icmp ne i32 %bf.cast, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2_1(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test2_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    cmpl $256, %eax # imm = 0x100
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 256
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_1(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_47(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_47:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_127(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_127:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $127, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg1(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg2(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-2, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg127(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg127:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-127, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg128:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-128, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -128
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_255(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_255:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 255
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll
--- a/llvm/test/CodeGen/X86/slow-incdec.ll
+++ b/llvm/test/CodeGen/X86/slow-incdec.ll
@@ -54,6 +54,28 @@
   ret i32 %r
 }
 
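+; With the zero entry count from !prof !14 these are compiled for size, so
+; the shorter incl/decl encodings are chosen over addl/subl of an immediate.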
+define i32 @inc_pgso(i32 %x) !prof !14 {
+; CHECK-LABEL: inc_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    retl
+  %r = add i32 %x, 1
+  ret i32 %r
+}
+
+define i32 @dec_pgso(i32 %x) !prof !14 {
+; CHECK-LABEL: dec_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    retl
+  %r = add i32 %x, -1
+  ret i32 %r
+}
+
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
 declare void @other(i32* ) nounwind;
 
@@ -62,20 +82,20 @@
 ; INCDEC:       # %bb.0: # %entry
 ; INCDEC-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; INCDEC-NEXT:    incl (%eax)
-; INCDEC-NEXT:    jne .LBB4_1
+; INCDEC-NEXT:    jne .LBB6_1
 ; INCDEC-NEXT:  # %bb.2: # %if.end4
 ; INCDEC-NEXT:    jmp other # TAILCALL
-; INCDEC-NEXT:  .LBB4_1: # %return
+; INCDEC-NEXT:  .LBB6_1: # %return
 ; INCDEC-NEXT:    retl
 ;
 ; ADD-LABEL: cond_ae_to_cond_ne:
 ; ADD:       # %bb.0: # %entry
 ; ADD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ADD-NEXT:    addl $1, (%eax)
-; ADD-NEXT:    jne .LBB4_1
+; ADD-NEXT:    jne .LBB6_1
 ; ADD-NEXT:  # %bb.2: # %if.end4
 ; ADD-NEXT:    jmp other # TAILCALL
-; ADD-NEXT:  .LBB4_1: # %return
+; ADD-NEXT:  .LBB6_1: # %return
 ; ADD-NEXT:    retl
 entry:
   %t0 = load i32, i32* %p, align 8
@@ -109,10 +129,10 @@
 ; INCDEC-NEXT:    incb a
 ; INCDEC-NEXT:    sete d
 ; INCDEC-NEXT:    testb %al, %al
-; INCDEC-NEXT:    jne .LBB5_2
+; INCDEC-NEXT:    jne .LBB7_2
 ; INCDEC-NEXT:  # %bb.1: # %then
 ; INCDEC-NEXT:    jmp external_a # TAILCALL
-; INCDEC-NEXT:  .LBB5_2: # %else
+; INCDEC-NEXT:  .LBB7_2: # %else
 ; INCDEC-NEXT:    jmp external_b # TAILCALL
 ;
 ; ADD-LABEL: test_tail_call:
@@ -123,10 +143,10 @@
 ; ADD-NEXT:    addb $1, a
 ; ADD-NEXT:    sete d
 ; ADD-NEXT:    testb %al, %al
-; ADD-NEXT:    jne .LBB5_2
+; ADD-NEXT:    jne .LBB7_2
 ; ADD-NEXT:  # %bb.1: # %then
 ; ADD-NEXT:    jmp external_a # TAILCALL
-; ADD-NEXT:  .LBB5_2: # %else
+; ADD-NEXT:  .LBB7_2: # %else
 ; ADD-NEXT:    jmp external_b # TAILCALL
 entry:
   %val = load i32, i32* %ptr
@@ -152,3 +172,19 @@
   ret void
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll
--- a/llvm/test/CodeGen/X86/splat-for-size.ll
+++ b/llvm/test/CodeGen/X86/splat-for-size.ll
@@ -17,6 +17,19 @@
   ret <2 x double> %add
 }
 
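+; The _pgso variants mirror the optsize/minsize tests: a cold profile makes
+; the backend splat constants via broadcasts to shrink the constant pool.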
+define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 {
+; CHECK-LABEL: splat_v2f64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; CHECK-NEXT:    # xmm1 = mem[0,0]
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+}
+
 define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       # %bb.0:
@@ -27,6 +38,16 @@
   ret <4 x double> %add
 }
 
+define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 {
+; CHECK-LABEL: splat_v4f64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+}
+
 define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       # %bb.0:
@@ -37,6 +58,16 @@
   ret <4 x float> %add
 }
 
+define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 {
+; CHECK-LABEL: splat_v4f32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+}
+
 define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       # %bb.0:
@@ -47,6 +78,16 @@
   ret <8 x float> %add
 }
 
+define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 {
+; CHECK-LABEL: splat_v8f32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
 ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
 define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
@@ -66,6 +107,23 @@
   ret <2 x i64> %add
 }
 
+define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 {
+; AVX-LABEL: splat_v2i64_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2,2]
+; AVX-NEXT:    # xmm1 = mem[0,0]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v2i64_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <2 x i64> %x, <i64 2, i64 2>
+  ret <2 x i64> %add
+}
+
 ; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
 ; and then we fake it: use vmovddup to splat 64-bit value.
 define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
@@ -88,6 +146,26 @@
   ret <4 x i64> %add
 }
 
+define <4 x i64> @splat_v4i64_pgso(<4 x i64> %x) !prof !14 {
+; AVX-LABEL: splat_v4i64_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2,2]
+; AVX-NEXT:    # xmm2 = mem[0,0]
+; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4i64_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2]
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
+  ret <4 x i64> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
 define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
 ; AVX-LABEL: splat_v4i32:
@@ -105,6 +183,22 @@
   ret <4 x i32> %add
 }
 
+define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 {
+; AVX-LABEL: splat_v4i32_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4i32_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
 define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ; AVX-LABEL: splat_v8i32:
@@ -125,6 +219,25 @@
   ret <8 x i32> %add
 }
 
+define <8 x i32> @splat_v8i32_pgso(<8 x i32> %x) !prof !14 {
+; AVX-LABEL: splat_v8i32_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8i32_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i32> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
 define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
 ; AVX-LABEL: splat_v8i16:
@@ -141,6 +254,21 @@
   ret <8 x i16> %add
 }
 
+define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 {
+; AVX-LABEL: splat_v8i16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8i16_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
 define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
 ; AVX-LABEL: splat_v16i16:
@@ -161,6 +289,25 @@
   ret <16 x i16> %add
 }
 
+define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 {
+; AVX-LABEL: splat_v16i16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v16i16_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <16 x i16> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
 define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
 ; AVX-LABEL: splat_v16i8:
@@ -177,6 +324,21 @@
   ret <16 x i8> %add
 }
 
+define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 {
+; AVX-LABEL: splat_v16i8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v16i8_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
 define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 ; AVX-LABEL: splat_v32i8:
@@ -197,6 +359,25 @@
   ret <32 x i8> %add
 }
 
+define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 {
+; AVX-LABEL: splat_v32i8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v32i8_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <32 x i8> %add
+}
+
 ; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend'
 ; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a
 ; loadi64 with multiple uses.
@@ -238,3 +419,20 @@
 
 attributes #0 = { optsize }
 attributes #1 = { minsize }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -822,6 +822,57 @@
   ret <2 x double> %res
 }
 
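+; PGSO counterpart of the cvtss2sd load test above: the load is folded
+; directly into cvtss2sd rather than loaded with a separate instruction.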
+define <2 x double> @test_x86_sse2_cvtss2sd_load_pgso(<2 x double> %a0, <4 x float>* %p1) !prof !14 {
+; X86-SSE-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    cvtss2sd (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x5a,0x08]
+; X86-SSE-NEXT:    movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1]
+; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX1-NEXT:    vcvtss2sd (%eax), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x08]
+; X86-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512-NEXT:    vcvtss2sd (%eax), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x08]
+; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    cvtss2sd (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x5a,0x0f]
+; X64-SSE-NEXT:    movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1]
+; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vcvtss2sd (%rdi), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x0f]
+; X64-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_pgso:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vcvtss2sd (%rdi), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x0f]
+; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <4 x float>, <4 x float>* %p1
+  %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1)
+  ret <2 x double> %res
+}
 
 define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
 ; SSE-LABEL: test_x86_sse2_cvtdq2ps:
@@ -1042,3 +1091,20 @@
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -431,6 +431,54 @@
   ret <4 x float> %tmp1
 }
 
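+; When optimizing for size based on the profile, movss is selected here
+; because its encoding is shorter than an equivalent insertps or blendps.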
+define <4 x float> @insertps_or_blendps_pgso(<4 x float> %t1, float %t2) nounwind !prof !14 {
+; X86-SSE-LABEL: insertps_or_blendps_pgso:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
+; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
+; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX1-LABEL: insertps_or_blendps_pgso:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
+; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512-LABEL: insertps_or_blendps_pgso:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
+; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: insertps_or_blendps_pgso:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
+; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: insertps_or_blendps_pgso:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: insertps_or_blendps_pgso:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
+  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+  ret <4 x float> %tmp1
+}
+
 ; An insert into the low 32-bits of a vector from the low 32-bits of another vector
 ; is always just a blendps because blendps is never more expensive than insertps.
 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
@@ -2179,3 +2225,20 @@
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
   ret <4 x float> %vecinit1
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
--- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
+++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
@@ -19,6 +19,23 @@
 
 }
 
+define void @zero_pgso(i32* %p) !prof !14 {
+; CHECK32-LABEL: zero_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl $0, (%eax)
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: zero_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $0, (%rdi)
+; CHECK64-NEXT:    retq
+entry:
+  store i32 0, i32* %p
+  ret void
+
+}
+
 define void @minus_one_optsize(i32* %p) optsize {
 ; CHECK32-LABEL: minus_one_optsize:
 ; CHECK32:       # %bb.0: # %entry
@@ -36,6 +53,22 @@
 
 }
 
+define void @minus_one_pgso(i32* %p) !prof !14 {
+; CHECK32-LABEL: minus_one_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl $-1, (%eax)
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $-1, (%rdi)
+; CHECK64-NEXT:    retq
+entry:
+  store i32 -1, i32* %p
+  ret void
+
+}
 
 define void @zero_64(i64* %p) minsize {
 ; CHECK32-LABEL: zero_64:
@@ -244,3 +277,20 @@
   store volatile i16 -1, i16* %p
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/switch-density.ll b/llvm/test/CodeGen/X86/switch-density.ll
--- a/llvm/test/CodeGen/X86/switch-density.ll
+++ b/llvm/test/CodeGen/X86/switch-density.ll
@@ -79,3 +79,72 @@
 ; CHECK: ja
 ; CHECK: jmpq *.LJTI
 }
+
+define void @dense_optsize(i32 %x) optsize {
+entry:
+  switch i32 %x, label %return [
+    i32 12, label %bb0
+    i32 4,  label %bb1
+    i32 16, label %bb1
+    i32 20, label %bb2
+    i32 8,  label %bb3
+  ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as branches.
+; CHECK-LABEL: dense_optsize
+; CHECK: cmpl $11
+; CHECK: cmpl $20
+; CHECK: cmpl $16
+; CHECK: cmpl $12
+; CHECK: cmpl $4
+; CHECK: cmpl $8
+; CHECK: retq
+}
+
+define void @dense_pgso(i32 %x) !prof !14 {
+entry:
+  switch i32 %x, label %return [
+    i32 12, label %bb0
+    i32 4,  label %bb1
+    i32 16, label %bb1
+    i32 20, label %bb2
+    i32 8,  label %bb3
+  ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as branches.
+; CHECK-LABEL: dense_pgso
+; CHECK: cmpl $11
+; CHECK: cmpl $20
+; CHECK: cmpl $16
+; CHECK: cmpl $12
+; CHECK: cmpl $4
+; CHECK: cmpl $8
+; CHECK: retq
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll
--- a/llvm/test/CodeGen/X86/tail-opts.ll
+++ b/llvm/test/CodeGen/X86/tail-opts.ll
@@ -480,6 +480,47 @@
   ret void
 }
 
+define void @one_pgso(i32 %v) nounwind !prof !14 {
+; CHECK-LABEL: one_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB6_3
+; CHECK-NEXT:  # %bb.1: # %bby
+; CHECK-NEXT:    cmpl $16, %edi
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    jmp tail_call_me # TAILCALL
+; CHECK-NEXT:  .LBB6_3: # %bbx
+; CHECK-NEXT:    cmpl $128, %edi
+; CHECK-NEXT:    jne tail_call_me # TAILCALL
+; CHECK-NEXT:  .LBB6_4: # %return
+; CHECK-NEXT:    retq
+entry:
+  %0 = icmp eq i32 %v, 0
+  br i1 %0, label %bbx, label %bby
+
+bby:
+  switch i32 %v, label %bb7 [
+    i32 16, label %return
+  ]
+
+bb7:
+  tail call void @tail_call_me()
+  ret void
+
+bbx:
+  switch i32 %v, label %bb12 [
+    i32 128, label %return
+  ]
+
+bb12:
+  tail call void @tail_call_me()
+  ret void
+
+return:
+  ret void
+}
+
 ; two - Same as one, but with two instructions in the common
 ; tail instead of one. This is too much to be merged, given
 ; the optsize attribute.
@@ -491,10 +532,51 @@
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB6_1
+; CHECK-NEXT:    je .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %return
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB6_1: # %bb7
+; CHECK-NEXT:  .LBB7_1: # %bb7
+; CHECK-NEXT:    movl $0, {{.*}}(%rip)
+; CHECK-NEXT:    movl $1, {{.*}}(%rip)
+entry:
+  %0 = icmp eq i32 undef, 0
+  br i1 %0, label %bbx, label %bby
+
+bby:
+  switch i32 undef, label %bb7 [
+    i32 16, label %return
+  ]
+
+bb7:
+  store volatile i32 0, i32* @XYZ
+  store volatile i32 1, i32* @XYZ
+  unreachable
+
+bbx:
+  switch i32 undef, label %bb12 [
+    i32 128, label %return
+  ]
+
+bb12:
+  store volatile i32 0, i32* @XYZ
+  store volatile i32 1, i32* @XYZ
+  unreachable
+
+return:
+  ret void
+}
+
+define void @two_pgso() nounwind !prof !14 {
+; CHECK-LABEL: two_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB8_1
+; CHECK-NEXT:  # %bb.2: # %return
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB8_1: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    movl $1, {{.*}}(%rip)
 entry:
@@ -534,10 +616,10 @@
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB7_1
+; CHECK-NEXT:    je .LBB9_1
 ; CHECK-NEXT:  # %bb.2: # %return
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB7_1: # %bb7
+; CHECK-NEXT:  .LBB9_1: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    movl $1, {{.*}}(%rip)
 entry:
@@ -575,20 +657,20 @@
 ; CHECK-LABEL: two_nosize:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    je .LBB8_3
+; CHECK-NEXT:    je .LBB10_3
 ; CHECK-NEXT:  # %bb.1: # %bby
 ; CHECK-NEXT:    testl %esi, %esi
-; CHECK-NEXT:    je .LBB8_4
+; CHECK-NEXT:    je .LBB10_4
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    jmp tail_call_me # TAILCALL
-; CHECK-NEXT:  .LBB8_3: # %bbx
+; CHECK-NEXT:  .LBB10_3: # %bbx
 ; CHECK-NEXT:    cmpl $-1, %edx
-; CHECK-NEXT:    je .LBB8_4
+; CHECK-NEXT:    je .LBB10_4
 ; CHECK-NEXT:  # %bb.5: # %bb12
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    jmp tail_call_me # TAILCALL
-; CHECK-NEXT:  .LBB8_4: # %return
+; CHECK-NEXT:  .LBB10_4: # %return
 ; CHECK-NEXT:    retq
 entry:
   %0 = icmp eq i32 %x, 0
@@ -628,11 +710,11 @@
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovgq %rdi, %rax
 ; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    jle .LBB9_2
+; CHECK-NEXT:    jle .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %bb.nph
 ; CHECK-NEXT:    imulq %rdi, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:  .LBB9_2: # %for.end
+; CHECK-NEXT:  .LBB11_2: # %for.end
 ; CHECK-NEXT:    retq
 entry:
   %cmp = icmp slt i64 %parami, 1                  ; <i1> [#uses=1]
@@ -661,24 +743,24 @@
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.1: # %cont1
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.2: # %cont2
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.3: # %cont3
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.4: # %cont4
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB10_5: # %abort1
+; CHECK-NEXT:  .LBB12_5: # %abort1
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq abort
 entry:
@@ -721,27 +803,27 @@
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_5
+; CHECK-NEXT:    je .LBB13_5
 ; CHECK-NEXT:  # %bb.1: # %cont1
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_6
+; CHECK-NEXT:    je .LBB13_6
 ; CHECK-NEXT:  # %bb.2: # %cont2
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_5
+; CHECK-NEXT:    je .LBB13_5
 ; CHECK-NEXT:  # %bb.3: # %cont3
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_6
+; CHECK-NEXT:    je .LBB13_6
 ; CHECK-NEXT:  # %bb.4: # %cont4
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB11_5: # %abort1
+; CHECK-NEXT:  .LBB13_5: # %abort1
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq abort
-; CHECK-NEXT:  .LBB11_6: # %abort2
+; CHECK-NEXT:  .LBB13_6: # %abort2
 ; CHECK-NEXT:    callq alt_abort
 entry:
   %c1 = call i1 @qux()
@@ -770,3 +852,20 @@
 cont4:
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll
--- a/llvm/test/CodeGen/X86/test-vs-bittest.ll
+++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll
@@ -49,6 +49,30 @@
   ret void
 }
 
+define void @test64_pgso(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jb .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB2_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 2048
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
 ; This test is identical to test64 above with only the destination of the br
 ; reversed. This somehow causes the two functions to get slightly different
 ; initial IR. One has an extra invert of the setcc. This previously caused one
@@ -60,10 +84,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB2_2
+; CHECK-NEXT:    je .LBB3_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB2_2: # %no
+; CHECK-NEXT:  .LBB3_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -84,10 +108,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB3_2
+; CHECK-NEXT:    jae .LBB4_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB3_2: # %no
+; CHECK-NEXT:  .LBB4_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 2048
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_2(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB5_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB5_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -108,10 +156,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jb .LBB4_2
+; CHECK-NEXT:    jb .LBB6_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB4_2: # %no
+; CHECK-NEXT:  .LBB6_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -132,10 +180,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jb .LBB5_2
+; CHECK-NEXT:    jb .LBB7_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB5_2: # %no
+; CHECK-NEXT:  .LBB7_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 4294967296
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_3(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btq $32, %rdi
+; CHECK-NEXT:    jb .LBB8_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB8_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -156,10 +228,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jae .LBB6_2
+; CHECK-NEXT:    jae .LBB9_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB6_2: # %no
+; CHECK-NEXT:  .LBB9_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -180,10 +252,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jae .LBB7_2
+; CHECK-NEXT:    jae .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB7_2: # %no
+; CHECK-NEXT:  .LBB10_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 4294967296
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_4(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btq $32, %rdi
+; CHECK-NEXT:    jae .LBB11_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB11_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -204,10 +300,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    jne .LBB8_2
+; CHECK-NEXT:    jne .LBB12_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB8_2: # %no
+; CHECK-NEXT:  .LBB12_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -228,10 +324,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jb .LBB9_2
+; CHECK-NEXT:    jb .LBB13_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB9_2: # %no
+; CHECK-NEXT:  .LBB13_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -252,10 +348,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB10_2
+; CHECK-NEXT:    je .LBB14_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB10_2: # %no
+; CHECK-NEXT:  .LBB14_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -276,10 +372,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB11_2
+; CHECK-NEXT:    jae .LBB15_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB11_2: # %no
+; CHECK-NEXT:  .LBB15_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i32 %x, 2048
+  %s = icmp eq i32 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test32_pgso_2(i32 inreg %x) !prof !14 {
+; CHECK-LABEL: test32_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB16_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB16_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -300,10 +420,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    jne .LBB12_2
+; CHECK-NEXT:    jne .LBB17_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB12_2: # %no
+; CHECK-NEXT:  .LBB17_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -324,10 +444,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jb .LBB13_2
+; CHECK-NEXT:    jb .LBB18_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB13_2: # %no
+; CHECK-NEXT:  .LBB18_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i16 %x, 2048
+  %s = icmp eq i16 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test16_pgso(i16 inreg %x) !prof !14 {
+; CHECK-LABEL: test16_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jb .LBB19_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB19_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -348,10 +492,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB14_2
+; CHECK-NEXT:    je .LBB20_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB14_2: # %no
+; CHECK-NEXT:  .LBB20_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -372,10 +516,34 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB15_2
+; CHECK-NEXT:    jae .LBB21_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB15_2: # %no
+; CHECK-NEXT:  .LBB21_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i16 %x, 2048
+  %s = icmp eq i16 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test16_pgso_2(i16 inreg %x) !prof !14 {
+; CHECK-LABEL: test16_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB22_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB22_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -512,3 +680,20 @@
 }
 
 declare void @bar()
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2002,6 +2002,44 @@
   ret <8 x i32> %b
 }
 
+define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 {
+; ALL-LABEL: shuffle_v4f64_0zzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; ALL-NEXT:    retq
+  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %b
+}
+
+define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
+; ALL-LABEL: shuffle_v4i64_0zzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; ALL-NEXT:    retq
+  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %b
+}
+
+define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
+; ALL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
+  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %b
+}
+
+define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
+; ALL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
+  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %b
+}
+
 define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
 ; ALL-LABEL: unpckh_v4i64:
 ; ALL:       # %bb.0:
@@ -2022,3 +2060,19 @@
   ret <4 x double> %unpckh
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
--- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
+++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
@@ -240,3 +240,140 @@
   %a = xor i64 %x, 9223372036854775808 ; toggle bit 63
   ret i64 %a
 }
+
+define i64 @and1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $31, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 18446744071562067967 ; clear bit 31
+  ret i64 %a
+}
+
+define i64 @and2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $32, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 18446744069414584319 ; clear bit 32
+  ret i64 %a
+}
+
+define i64 @and3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $62, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 13835058055282163711 ; clear bit 62
+  ret i64 %a
+}
+
+define i64 @and4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $63, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 9223372036854775807 ; clear bit 63
+  ret i64 %a
+}
+
+define i64 @or1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $31, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 2147483648 ; set bit 31
+  ret i64 %a
+}
+
+define i64 @or2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $32, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 4294967296 ; set bit 32
+  ret i64 %a
+}
+
+define i64 @or3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $62, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 4611686018427387904 ; set bit 62
+  ret i64 %a
+}
+
+define i64 @or4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $63, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 9223372036854775808 ; set bit 63
+  ret i64 %a
+}
+
+define i64 @xor1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $31, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 2147483648 ; toggle bit 31
+  ret i64 %a
+}
+
+define i64 @xor2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $32, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 4294967296 ; toggle bit 32
+  ret i64 %a
+}
+
+define i64 @xor3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $62, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 4611686018427387904 ; toggle bit 62
+  ret i64 %a
+}
+
+define i64 @xor4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $63, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 9223372036854775808 ; toggle bit 63
+  ret i64 %a
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -50,6 +50,19 @@
   ret i64 %or
 }
 
+define i64 @_Z8lshift11mm_pgso(i64 %a, i64 %b) !prof !14 {
+; CHECK-LABEL: _Z8lshift11mm_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shldq $11, %rsi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %shl = shl i64 %a, 11
+  %shr = lshr i64 %b, 53
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
 attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 ; clang -O2 -c test2.cpp -emit-llvm -S
@@ -78,3 +91,19 @@
 
 attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
--- a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
@@ -25,6 +25,26 @@
   ret void
 }
 
+define void @f_pgso(i8* %p, i8* %q, i32* inalloca nocapture %unused) !prof !14 {
+entry:
+  %g = alloca %struct.T, align 8
+  %r = alloca i32, align 8
+  store i32 0, i32* %r, align 4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %p, i8* align 8 %q, i32 24, i1 false)
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %load = load i32, i32* %r, align 4
+  %dec = add nsw i32 %load, -1
+  store i32 %dec, i32* %r, align 4
+  call void @g(%struct.T* %g)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body
+  ret void
+}
+
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1
 
@@ -46,5 +66,38 @@
 ; CHECK:     testb    %[[NE_REG]], %[[NE_REG]]
 ; CHECK:     jne
 
+; CHECK-LABEL: _f_pgso:
+; CHECK:     pushl %ebp
+; CHECK:     movl %esp, %ebp
+; CHECK:     andl $-8, %esp
+; CHECK-NOT: movl %esp, %esi
+; CHECK:     rep;movsl
+; CHECK:     leal 8(%esp), %esi
+
+; CHECK:     decl     (%esp)
+; CHECK:     setne    %[[NE_REG:.*]]
+; CHECK:     pushl     %esi
+; CHECK:     calll     _g
+; CHECK:     addl     $4, %esp
+; CHECK:     testb    %[[NE_REG]], %[[NE_REG]]
+; CHECK:     jne
+
 attributes #0 = { nounwind optsize }
 attributes #1 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -152,6 +152,30 @@
   br label %fallthrough
 }
 
+; Negative test - opt for size
+define void @test6_pgso(i1 %cond, i64* %base) !prof !14 {
+; CHECK-LABEL: @test6_pgso
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
 
 ; Make sure sinking two copies of addressing mode into different blocks works
 ; when there are cold paths for each.
@@ -278,3 +302,20 @@
   store i1 false, i1* %G23
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
--- a/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -66,7 +66,7 @@
     if (!DAG)
       report_fatal_error("DAG?");
     OptimizationRemarkEmitter ORE(F);
-    DAG->init(*MF, ORE, nullptr, nullptr, nullptr);
+    DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr);
   }
 
   LLVMContext Context;
diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt
--- a/llvm/unittests/CodeGen/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/CMakeLists.txt
@@ -18,6 +18,7 @@
   MachineInstrBundleIteratorTest.cpp
   MachineInstrTest.cpp
   MachineOperandTest.cpp
+  MachineSizeOptsTest.cpp
   ScalableVectorMVTsTest.cpp
   TypeTraitsTest.cpp
   TargetOptionsTest.cpp
diff --git a/llvm/unittests/CodeGen/MachineSizeOptsTest.cpp b/llvm/unittests/CodeGen/MachineSizeOptsTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/CodeGen/MachineSizeOptsTest.cpp
@@ -0,0 +1,234 @@
+//===- MachineSizeOptsTest.cpp - MachineSizeOpts unit tests ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
+  auto TT(Triple::normalize("x86_64--"));
+  std::string Error;
+  const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      TheTarget->createTargetMachine(TT, "", "", TargetOptions(), None, None,
+                                     CodeGenOpt::Default)));
+}
+
+class MachineSizeOptsTest : public testing::Test {
+protected:
+  static const char* MIRString;
+  LLVMContext Context;
+  std::unique_ptr<LLVMTargetMachine> TM;
+  std::unique_ptr<MachineModuleInfo> MMI;
+  std::unique_ptr<MIRParser> Parser;
+  std::unique_ptr<Module> M;
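+  // Bundles the analyses MachineBlockFrequencyInfo depends on, constructed
+  // in dependency order: dominators, then loops, then branch probabilities.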
+  struct BFIData {
+    std::unique_ptr<MachineDominatorTree> MDT;
+    std::unique_ptr<MachineLoopInfo> MLI;
+    std::unique_ptr<MachineBranchProbabilityInfo> MBPI;
+    std::unique_ptr<MachineBlockFrequencyInfo> MBFI;
+    BFIData(MachineFunction &MF) {
+      MDT.reset(new MachineDominatorTree(MF));
+      MLI.reset(new MachineLoopInfo(*MDT));
+      MBPI.reset(new MachineBranchProbabilityInfo());
+      MBFI.reset(new MachineBlockFrequencyInfo(MF, *MBPI, *MLI));
+    }
+    MachineBlockFrequencyInfo *get() { return MBFI.get(); }
+  };
+
+  static void SetUpTestCase() {
+    InitializeAllTargets();
+    InitializeAllTargetMCs();
+  }
+
+  void SetUp() override {
+    TM = createTargetMachine();
+    std::unique_ptr<MemoryBuffer> MBuffer =
+        MemoryBuffer::getMemBuffer(MIRString);
+    Parser = createMIRParser(std::move(MBuffer), Context);
+    if (!Parser)
+      report_fatal_error("null MIRParser");
+    M = Parser->parseIRModule();
+    if (!M)
+      report_fatal_error("parseIRModule failed");
+    M->setTargetTriple(TM->getTargetTriple().getTriple());
+    M->setDataLayout(TM->createDataLayout());
+    MMI = std::make_unique<MachineModuleInfo>(TM.get());
+    if (Parser->parseMachineFunctions(*M, *MMI.get()))
+      report_fatal_error("parseMachineFunctions failed");
+  }
+
+  MachineFunction *getMachineFunction(Module *M, StringRef Name) {
+    auto F = M->getFunction(Name);
+    if (!F)
+      report_fatal_error("null Function");
+    auto &MF = MMI->getOrCreateMachineFunction(*F);
+    return &MF;
+  }
+};
+
+TEST_F(MachineSizeOptsTest, Test) {
+  MachineFunction *F = getMachineFunction(M.get(), "f");
+  ASSERT_TRUE(F != nullptr);
+  MachineFunction *G = getMachineFunction(M.get(), "g");
+  ASSERT_TRUE(G != nullptr);
+  MachineFunction *H = getMachineFunction(M.get(), "h");
+  ASSERT_TRUE(H != nullptr);
+  ProfileSummaryInfo PSI(*M);
+  ASSERT_TRUE(PSI.hasProfileSummary());
+  BFIData BFID_F(*F);
+  BFIData BFID_G(*G);
+  BFIData BFID_H(*H);
+  MachineBlockFrequencyInfo *MBFI_F = BFID_F.get();
+  MachineBlockFrequencyInfo *MBFI_G = BFID_G.get();
+  MachineBlockFrequencyInfo *MBFI_H = BFID_H.get();
+  MachineBasicBlock &BB0 = F->front();
+  auto iter = BB0.succ_begin();
+  MachineBasicBlock *BB1 = *iter;
+  iter++;
+  MachineBasicBlock *BB2 = *iter;
+  iter++;
+  ASSERT_TRUE(iter == BB0.succ_end());
+  MachineBasicBlock *BB3 = *BB1->succ_begin();
+  ASSERT_TRUE(BB3 == *BB2->succ_begin());
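+  // With this summary, only g (entry count 1) falls below the cold-count
+  // threshold; f (count 400) and h (count 100) do not, so only g is
+  // size-optimized as a whole. Within f, bb2 is the weight-1 successor of
+  // bb0 (!17), making it the only PGSO-cold block.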
+  EXPECT_FALSE(shouldOptimizeForSize(F, &PSI, MBFI_F));
+  EXPECT_TRUE(shouldOptimizeForSize(G, &PSI, MBFI_G));
+  EXPECT_FALSE(shouldOptimizeForSize(H, &PSI, MBFI_H));
+  EXPECT_FALSE(shouldOptimizeForSize(&BB0, &PSI, MBFI_F));
+  EXPECT_FALSE(shouldOptimizeForSize(BB1, &PSI, MBFI_F));
+  EXPECT_TRUE(shouldOptimizeForSize(BB2, &PSI, MBFI_F));
+  EXPECT_FALSE(shouldOptimizeForSize(BB3, &PSI, MBFI_F));
+}
+
+const char* MachineSizeOptsTest::MIRString = R"MIR(
+--- |
+  define i32 @g(i32 %x) !prof !14 {
+    ret i32 0
+  }
+
+  define i32 @h(i32 %x) !prof !15 {
+    ret i32 0
+  }
+
+  define i32 @f(i32 %x) !prof !16 {
+  bb0:
+    %y1 = icmp eq i32 %x, 0
+    br i1 %y1, label %bb1, label %bb2, !prof !17
+
+  bb1:                                              ; preds = %bb0
+    %z1 = call i32 @g(i32 %x)
+    br label %bb3
+
+  bb2:                                              ; preds = %bb0
+    %z2 = call i32 @h(i32 %x)
+    br label %bb3
+
+  bb3:                                              ; preds = %bb2, %bb1
+    %y2 = phi i32 [ 0, %bb1 ], [ 1, %bb2 ]
+    ret i32 %y2
+  }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"ProfileSummary", !1}
+  !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+  !2 = !{!"ProfileFormat", !"InstrProf"}
+  !3 = !{!"TotalCount", i64 10000}
+  !4 = !{!"MaxCount", i64 10}
+  !5 = !{!"MaxInternalCount", i64 1}
+  !6 = !{!"MaxFunctionCount", i64 1000}
+  !7 = !{!"NumCounts", i64 3}
+  !8 = !{!"NumFunctions", i64 3}
+  !9 = !{!"DetailedSummary", !10}
+  !10 = !{!11, !12, !13}
+  !11 = !{i32 10000, i64 1000, i32 1}
+  !12 = !{i32 999000, i64 300, i32 3}
+  !13 = !{i32 999999, i64 5, i32 10}
+  !14 = !{!"function_entry_count", i64 1}
+  !15 = !{!"function_entry_count", i64 100}
+  !16 = !{!"function_entry_count", i64 400}
+  !17 = !{!"branch_weights", i32 100, i32 1}
+
+...
+---
+name:            g
+body:             |
+  bb.0:
+    %1:gr32 = MOV32r0 implicit-def dead $eflags
+    $eax = COPY %1
+    RET 0, $eax
+
+...
+---
+name:            h
+body:             |
+  bb.0:
+    %1:gr32 = MOV32r0 implicit-def dead $eflags
+    $eax = COPY %1
+    RET 0, $eax
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1(0x7ebb907a), %bb.2(0x01446f86)
+    liveins: $edi
+
+    %1:gr32 = COPY $edi
+    TEST32rr %1, %1, implicit-def $eflags
+    JCC_1 %bb.2, 5, implicit $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $edi = COPY %1
+    CALL64pcrel32 @g, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %5:gr32 = COPY $eax
+    %4:gr32 = MOV32r0 implicit-def dead $eflags
+    JMP_1 %bb.3
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $edi = COPY %1
+    CALL64pcrel32 @h, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    %3:gr32 = COPY $eax
+    %2:gr32 = MOV32ri 1
+
+  bb.3:
+    %0:gr32 = PHI %2, %bb.2, %4, %bb.1
+    $eax = COPY %0
+    RET 0, $eax
+
+...
+)MIR";
+
+} // anonymous namespace
diff --git a/llvm/unittests/Transforms/Utils/CMakeLists.txt b/llvm/unittests/Transforms/Utils/CMakeLists.txt
--- a/llvm/unittests/Transforms/Utils/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Utils/CMakeLists.txt
@@ -14,6 +14,7 @@
   FunctionComparatorTest.cpp
   IntegerDivisionTest.cpp
   LocalTest.cpp
+  SizeOptsTest.cpp
   SSAUpdaterBulkTest.cpp
   UnrollLoopTest.cpp
   ValueMapperTest.cpp
diff --git a/llvm/unittests/Transforms/Utils/SizeOptsTest.cpp b/llvm/unittests/Transforms/Utils/SizeOptsTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Transforms/Utils/SizeOptsTest.cpp
@@ -0,0 +1,129 @@
+//===- SizeOptsTest.cpp - SizeOpts unit tests -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class SizeOptsTest : public testing::Test {
+protected:
+  static const char* IRString;
+  LLVMContext C;
+  std::unique_ptr<Module> M;
+  struct BFIData {
+    std::unique_ptr<DominatorTree> DT;
+    std::unique_ptr<LoopInfo> LI;
+    std::unique_ptr<BranchProbabilityInfo> BPI;
+    std::unique_ptr<BlockFrequencyInfo> BFI;
+    BFIData(Function &F) {
+      DT.reset(new DominatorTree(F));
+      LI.reset(new LoopInfo(*DT));
+      BPI.reset(new BranchProbabilityInfo(F, *LI));
+      BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+    }
+    BlockFrequencyInfo *get() { return BFI.get(); }
+  };
+
+  void SetUp() override {
+    SMDiagnostic Err;
+    M = parseAssemblyString(IRString, Err, C);
+    ASSERT_TRUE(M != nullptr) << Err.getMessage().str();
+  }
+};
+
+TEST_F(SizeOptsTest, Test) {
+  Function *F = M->getFunction("f");
+  Function *G = M->getFunction("g");
+  Function *H = M->getFunction("h");
+
+  ProfileSummaryInfo PSI(*M.get());
+  BFIData BFID_F(*F);
+  BFIData BFID_G(*G);
+  BFIData BFID_H(*H);
+  BlockFrequencyInfo *BFI_F = BFID_F.get();
+  BlockFrequencyInfo *BFI_G = BFID_G.get();
+  BlockFrequencyInfo *BFI_H = BFID_H.get();
+  BasicBlock &BB0 = F->getEntryBlock();
+  BasicBlock *BB1 = BB0.getTerminator()->getSuccessor(0);
+  BasicBlock *BB2 = BB0.getTerminator()->getSuccessor(1);
+  BasicBlock *BB3 = BB1->getSingleSuccessor();
+
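+  // Expectations mirror MachineSizeOptsTest: only g is cold by entry count,
+  // and within f only bb2 (the weight-1 successor of bb0) should be cold.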
+  EXPECT_TRUE(PSI.hasProfileSummary());
+  EXPECT_FALSE(shouldOptimizeForSize(F, &PSI, BFI_F));
+  EXPECT_TRUE(shouldOptimizeForSize(G, &PSI, BFI_G));
+  EXPECT_FALSE(shouldOptimizeForSize(H, &PSI, BFI_H));
+  EXPECT_FALSE(shouldOptimizeForSize(&BB0, &PSI, BFI_F));
+  EXPECT_FALSE(shouldOptimizeForSize(BB1, &PSI, BFI_F));
+  EXPECT_TRUE(shouldOptimizeForSize(BB2, &PSI, BFI_F));
+  EXPECT_FALSE(shouldOptimizeForSize(BB3, &PSI, BFI_F));
+}
+
+const char* SizeOptsTest::IRString = R"IR(
+  define i32 @g(i32 %x) !prof !14 {
+    ret i32 0
+  }
+
+  define i32 @h(i32 %x) !prof !15 {
+    ret i32 0
+  }
+
+  define i32 @f(i32 %x) !prof !16 {
+  bb0:
+    %y1 = icmp eq i32 %x, 0
+    br i1 %y1, label %bb1, label %bb2, !prof !17
+
+  bb1:                                              ; preds = %bb0
+    %z1 = call i32 @g(i32 %x)
+    br label %bb3
+
+  bb2:                                              ; preds = %bb0
+    %z2 = call i32 @h(i32 %x)
+    br label %bb3
+
+  bb3:                                              ; preds = %bb2, %bb1
+    %y2 = phi i32 [ 0, %bb1 ], [ 1, %bb2 ]
+    ret i32 %y2
+  }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"ProfileSummary", !1}
+  !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+  !2 = !{!"ProfileFormat", !"InstrProf"}
+  !3 = !{!"TotalCount", i64 10000}
+  !4 = !{!"MaxCount", i64 10}
+  !5 = !{!"MaxInternalCount", i64 1}
+  !6 = !{!"MaxFunctionCount", i64 1000}
+  !7 = !{!"NumCounts", i64 3}
+  !8 = !{!"NumFunctions", i64 3}
+  !9 = !{!"DetailedSummary", !10}
+  !10 = !{!11, !12, !13}
+  !11 = !{i32 10000, i64 1000, i32 1}
+  !12 = !{i32 999000, i64 300, i32 3}
+  !13 = !{i32 999999, i64 5, i32 10}
+  !14 = !{!"function_entry_count", i64 1}
+  !15 = !{!"function_entry_count", i64 100}
+  !16 = !{!"function_entry_count", i64 400}
+  !17 = !{!"branch_weights", i32 100, i32 1}
+)IR";
+
+} // end anonymous namespace
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -5129,6 +5129,14 @@
   SubtargetFeatureInfo::emitComputeAvailableFeatures(
       Target.getName(), "InstructionSelector", "computeAvailableModuleFeatures",
       ModuleFeatures, OS);
+
+  if (Target.getName() == "X86") {
+    // TODO: Implement PGSO.
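+    // Until then, the emitted hook honors only the IR-level optsize
+    // attribute; the generated matcher has no PSI/MBFI plumbing yet.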
+    OS << "static bool shouldOptForSize(const MachineFunction *MF) {\n";
+    OS << "    return MF->getFunction().hasOptSize();\n";
+    OS << "}\n\n";
+  }
+
   SubtargetFeatureInfo::emitComputeAvailableFeatures(
       Target.getName(), "InstructionSelector",
       "computeAvailableFunctionFeatures", FunctionFeatures, OS,