diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -202,9 +202,10 @@
       Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
       LoopInfo *LI, OptimizationRemarkEmitter *ORE,
       LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
-      AssumptionCache *AC)
+      AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
       : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
-        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC),
+        BFI(BFI), PSI(PSI) {}

   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -478,6 +479,10 @@
   /// Assume instructions in predicated blocks must be dropped if the CFG gets
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
 };

 } // namespace llvm
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3310,7 +3310,7 @@
     // Check to see if this FP immediate is already legal.
     // If this is a legal constant, turn it into a TargetConstantFP node.
     if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0),
-                          DAG.getMachineFunction().getFunction().hasOptSize()))
+                          DAG.shouldOptForSize()))
       Results.push_back(ExpandConstantFP(CFP, true));
     break;
   }
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -16,8 +16,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Debug.h"
@@ -111,6 +114,12 @@
         MachineFunctionProperties::Property::NoVRegs);
   }

+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   TargetSchedModel TSM;
   const X86InstrInfo *TII = nullptr;
@@ -205,21 +214,27 @@
   TSM.init(&ST);
   TII = ST.getInstrInfo();
   TRI = ST.getRegisterInfo();
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary())
+                   ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
+                   : nullptr;
   LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   for (MachineBasicBlock &MBB : MF) {
     // First pass. Try to remove or optimize existing LEAs.
+    bool OptIncDecPerBB =
+        OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
       if (!isLEA(I->getOpcode()))
         continue;

-      if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+      if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP))
         continue;

       if (IsSlowLEA)
         processInstructionForSlowLEA(I, MBB);
       else if (IsSlow3OpsLEA)
-        processInstrForSlow3OpLEA(I, MBB, OptIncDec);
+        processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB);
     }

     // Second pass for creating LEAs. This may reverse some of the
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4526,7 +4526,7 @@
     // the patterns on the add/sub/and/or/xor with immediate patterns in the
     // tablegen files to check immediate use count without making the patterns
     // unavailable to the fast-isel table.
-    if (!OptForSize)
+    if (!CurDAG->shouldOptForSize())
       break;

     // Only handle i8/i16/i32/i64.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34378,7 +34378,7 @@
     return DAG.getBitcast(RootVT, V1);
   }

-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   unsigned RootSizeInBits = RootVT.getSizeInBits();
   unsigned NumRootElts = RootVT.getVectorNumElements();
   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -39218,7 +39218,7 @@
   }

   // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
     return SDValue();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 using namespace llvm;

@@ -412,7 +413,11 @@
   const ValueToValueMap &Strides = getSymbolicStrides() ?
      *getSymbolicStrides() : ValueToValueMap();

-  bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+  Function *F = TheLoop->getHeader()->getParent();
+  bool OptForSize = F->hasOptSize() ||
+                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+                                                PGSOQueryType::IRPass);
+  bool CanAddPredicate = !OptForSize;
   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
   if (Stride == 1 || Stride == -1)
     return Stride;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -395,11 +395,13 @@
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
-                     LoopVectorizationCostModel *CM)
+                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                     ProfileSummaryInfo *PSI)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()),
-        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+        BFI(BFI), PSI(PSI) {}
   virtual ~InnerLoopVectorizer() = default;

   /// Create a new empty loop. Unlink the old loop and connect the new one.
@@ -779,6 +781,10 @@
   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
   // fixed up at the end of vector code generation.
   SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -789,9 +795,10 @@
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
-                   LoopVectorizationCostModel *CM)
+                   LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                   ProfileSummaryInfo *PSI)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
-                            UnrollFactor, LVL, CM) {}
+                            UnrollFactor, LVL, CM, BFI, PSI) {}

 private:
   Value *getBroadcastInstrs(Value *V) override;
@@ -2754,7 +2761,9 @@
   if (C->isZero())
     return;

-  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+           llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                       PGSOQueryType::IRPass)) &&
          "Cannot SCEV check stride or overflow when optimizing for size");

   SCEVCheckBlock->setName("vector.scevcheck");
@@ -2800,7 +2809,9 @@
   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                             "claimed checks are required");

-  if (MemCheckBlock->getParent()->hasOptSize()) {
+  if (MemCheckBlock->getParent()->hasOptSize() ||
+      llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                  PGSOQueryType::IRPass)) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -7716,7 +7727,7 @@
   LVP.setBestPlan(VF.Width, 1);

   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                         &CM);
+                         &CM, BFI, PSI);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
   LVP.executePlan(LB, DT);
@@ -7780,7 +7791,7 @@
   // Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -7980,8 +7991,8 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -7993,7 +8004,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + &LVL, &CM, BFI, PSI); LVP.executePlan(LB, DT); ++LoopsVectorized; diff --git a/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll b/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll --- a/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll @@ -38,3 +38,38 @@ ; CHECK-NEXT: ret ret fp128 0xL00000000000000000000000000000000 } + +; CHECK: literal8 +; CHECK: .quad 0x0000001fffffffd +define double @foo2_pgso() !prof !14 { +; CHECK: _foo2_pgso: +; CHECK: adrp x[[REG:[0-9]+]], lCPI4_0@PAGE +; CHECK: ldr d0, [x[[REG]], lCPI4_0@PAGEOFF] +; CHECK-NEXT: ret + ret double 0x1FFFFFFFd1 +} + +define float @bar_pgso() !prof !14 { +; CHECK: _bar_pgso: +; CHECK: adrp x[[REG:[0-9]+]], lCPI5_0@PAGE +; CHECK: ldr s0, [x[[REG]], lCPI5_0@PAGEOFF] +; CHECK-NEXT: ret + ret float 0x400921FB80000000 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll --- a/llvm/test/CodeGen/X86/fixup-lea.ll +++ b/llvm/test/CodeGen/X86/fixup-lea.ll @@ -109,31 +109,18 @@ } define void @foo_pgso(i32 inreg %dns) !prof !14 { -; SLOW-LABEL: foo_pgso: -; SLOW: # %bb.0: # %entry -; SLOW-NEXT: xorl %ecx, %ecx -; SLOW-NEXT: decl %ecx -; SLOW-NEXT: .LBB4_1: # %for.body -; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 -; SLOW-NEXT: movzwl %cx, %edx -; SLOW-NEXT: decl %ecx -; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB4_1 -; SLOW-NEXT: # %bb.2: # %for.end -; SLOW-NEXT: retl -; -; FAST-LABEL: foo_pgso: -; FAST: # %bb.0: # %entry -; FAST-NEXT: xorl %ecx, %ecx -; FAST-NEXT: decl %ecx -; FAST-NEXT: .LBB4_1: # %for.body -; FAST-NEXT: # =>This Inner Loop Header: Depth=1 -; FAST-NEXT: movzwl %cx, %edx -; FAST-NEXT: addl $-1, %ecx -; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB4_1 -; FAST-NEXT: # %bb.2: # %for.end -; FAST-NEXT: retl +; CHECK-LABEL: foo_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: decl %ecx +; CHECK-NEXT: .LBB4_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movzwl %cx, %edx +; CHECK-NEXT: decl %ecx +; CHECK-NEXT: cmpl %eax, %edx +; 
CHECK-NEXT: jl .LBB4_1
+; CHECK-NEXT: # %bb.2: # %for.end
+; CHECK-NEXT: retl
 entry:
   br label %for.body
@@ -149,31 +136,18 @@
 }

 define void @bar_pgso(i32 inreg %dns) !prof !14 {
-; SLOW-LABEL: bar_pgso:
-; SLOW: # %bb.0: # %entry
-; SLOW-NEXT: xorl %ecx, %ecx
-; SLOW-NEXT: incl %ecx
-; SLOW-NEXT: .LBB5_1: # %for.body
-; SLOW-NEXT: # =>This Inner Loop Header: Depth=1
-; SLOW-NEXT: movzwl %cx, %edx
-; SLOW-NEXT: incl %ecx
-; SLOW-NEXT: cmpl %eax, %edx
-; SLOW-NEXT: jl .LBB5_1
-; SLOW-NEXT: # %bb.2: # %for.end
-; SLOW-NEXT: retl
-;
-; FAST-LABEL: bar_pgso:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xorl %ecx, %ecx
-; FAST-NEXT: incl %ecx
-; FAST-NEXT: .LBB5_1: # %for.body
-; FAST-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-NEXT: movzwl %cx, %edx
-; FAST-NEXT: addl $1, %ecx
-; FAST-NEXT: cmpl %eax, %edx
-; FAST-NEXT: jl .LBB5_1
-; FAST-NEXT: # %bb.2: # %for.end
-; FAST-NEXT: retl
+; CHECK-LABEL: bar_pgso:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: .LBB5_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movzwl %cx, %edx
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: cmpl %eax, %edx
+; CHECK-NEXT: jl .LBB5_1
+; CHECK-NEXT: # %bb.2: # %for.end
+; CHECK-NEXT: retl
 entry:
   br label %for.body
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -184,6 +184,7 @@
 ; CHECK-NEXT: X86 Byte/Word Instruction Fixup
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Atom pad short functions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 LEA Fixup
 ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
 ; CHECK-NEXT: X86 Discriminate Memory Operands
diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll
--- a/llvm/test/CodeGen/X86/phaddsub-extract.ll
+++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll
@@ -2094,6 +2094,28 @@
   ret i32 %x230
 }

+define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_4_pgso:
+; SSE3: # %bb.0:
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: retq
+;
+; AVX-LABEL: hadd32_4_pgso:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+  %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %x227 = add <4 x i32> %x225, %x226
+  %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %x229 = add <4 x i32> %x227, %x228
+  %x230 = extractelement <4 x i32> %x229, i32 0
+  ret i32 %x230
+}
+
 define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize {
 ; SSE3-LABEL: hadd32_8_optsize:
 ; SSE3: # %bb.0:
@@ -2141,3 +2163,20 @@
   %x230 = extractelement <16 x i32> %x229, i32 0
   ret i32 %x230
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/popcnt.ll
b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -1034,8 +1034,454 @@ ret i128 %cnt } +define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { +; X32-LABEL: cnt32_pgso: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl %ecx +; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X32-NEXT: subl %ecx, %eax +; X32-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X32-NEXT: movl %eax, %edx +; X32-NEXT: andl %ecx, %edx +; X32-NEXT: shrl $2, %eax +; X32-NEXT: andl %ecx, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $4, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X32-NEXT: shrl $24, %eax +; X32-NEXT: retl +; +; X64-LABEL: cnt32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: subl %eax, %edi +; X64-NEXT: movl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: andl %eax, %ecx +; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl %eax, %edi +; X64-NEXT: addl %ecx, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $4, %eax +; X64-NEXT: addl %edi, %eax +; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-NEXT: shrl $24, %eax +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt32_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: cnt32_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntl %edi, %eax +; X64-POPCNT-NEXT: retq + %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) + ret i32 %cnt +} + +define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { +; X32-NOSSE-LABEL: cnt64_pgso: +; X32-NOSSE: # %bb.0: +; X32-NOSSE-NEXT: pushl %ebx +; X32-NOSSE-NEXT: pushl %edi +; X32-NOSSE-NEXT: pushl %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOSSE-NEXT: movl %ecx, %edx +; X32-NOSSE-NEXT: shrl %edx +; X32-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %esi, %edx +; X32-NOSSE-NEXT: subl %edx, %ecx +; X32-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 +; X32-NOSSE-NEXT: movl %ecx, %edi +; X32-NOSSE-NEXT: andl %edx, %edi +; X32-NOSSE-NEXT: shrl $2, %ecx +; X32-NOSSE-NEXT: andl %edx, %ecx +; X32-NOSSE-NEXT: addl %edi, %ecx +; X32-NOSSE-NEXT: movl %ecx, %edi +; X32-NOSSE-NEXT: shrl $4, %edi +; X32-NOSSE-NEXT: addl %ecx, %edi +; X32-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: andl %ecx, %edi +; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %edi +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: shrl %ebx +; X32-NOSSE-NEXT: andl %esi, %ebx +; X32-NOSSE-NEXT: subl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %esi +; X32-NOSSE-NEXT: andl %edx, %esi +; X32-NOSSE-NEXT: shrl $2, %eax +; X32-NOSSE-NEXT: andl %edx, %eax +; X32-NOSSE-NEXT: addl %esi, %eax +; X32-NOSSE-NEXT: movl %eax, %edx +; X32-NOSSE-NEXT: shrl $4, %edx +; X32-NOSSE-NEXT: addl %eax, %edx +; X32-NOSSE-NEXT: andl %ecx, %edx +; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: addl %edi, %eax +; X32-NOSSE-NEXT: xorl %edx, %edx +; X32-NOSSE-NEXT: popl %esi +; X32-NOSSE-NEXT: popl %edi +; X32-NOSSE-NEXT: popl %ebx +; 
X32-NOSSE-NEXT: retl +; +; X64-LABEL: cnt64_pgso: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: subq %rcx, %rdi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: shrq $56, %rax +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt64_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: addl %ecx, %eax +; X32-POPCNT-NEXT: xorl %edx, %edx +; X32-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: cnt64_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntq %rdi, %rax +; X64-POPCNT-NEXT: retq +; +; X32-SSE2-LABEL: cnt64_pgso: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $1, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: psubb %xmm1, %xmm0 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: pand %xmm1, %xmm2 +; X32-SSE2-NEXT: psrlw $2, %xmm0 +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: paddb %xmm2, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $4, %xmm1 +; X32-SSE2-NEXT: paddb %xmm0, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: pxor %xmm0, %xmm0 +; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: xorl %edx, %edx +; X32-SSE2-NEXT: retl +; +; X32-SSSE3-LABEL: cnt64_pgso: +; X32-SSSE3: # %bb.0: +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X32-SSSE3-NEXT: psrlw $4, %xmm1 +; X32-SSSE3-NEXT: pand %xmm0, %xmm1 +; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X32-SSSE3-NEXT: movd %xmm0, %eax +; X32-SSSE3-NEXT: xorl %edx, %edx +; X32-SSSE3-NEXT: retl + %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) + ret i64 %cnt +} + +define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { +; X32-NOSSE-LABEL: cnt128_pgso: +; X32-NOSSE: # %bb.0: +; X32-NOSSE-NEXT: pushl %ebp +; X32-NOSSE-NEXT: pushl %ebx +; X32-NOSSE-NEXT: pushl %edi +; X32-NOSSE-NEXT: pushl %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOSSE-NEXT: movl %ebx, %ecx +; X32-NOSSE-NEXT: shrl %ecx +; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %edi, %ecx +; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X32-NOSSE-NEXT: subl %ecx, %ebx +; 
X32-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X32-NOSSE-NEXT: movl %ebx, %ebp +; X32-NOSSE-NEXT: andl %ecx, %ebp +; X32-NOSSE-NEXT: shrl $2, %ebx +; X32-NOSSE-NEXT: andl %ecx, %ebx +; X32-NOSSE-NEXT: addl %ebp, %ebx +; X32-NOSSE-NEXT: movl %ebx, %ebp +; X32-NOSSE-NEXT: shrl $4, %ebp +; X32-NOSSE-NEXT: addl %ebx, %ebp +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: shrl %ebx +; X32-NOSSE-NEXT: andl %edi, %ebx +; X32-NOSSE-NEXT: subl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: andl %ecx, %ebx +; X32-NOSSE-NEXT: shrl $2, %eax +; X32-NOSSE-NEXT: andl %ecx, %eax +; X32-NOSSE-NEXT: addl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %edi +; X32-NOSSE-NEXT: shrl $4, %edi +; X32-NOSSE-NEXT: addl %eax, %edi +; X32-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: andl %ebx, %ebp +; X32-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: andl %ebx, %edi +; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %edi +; X32-NOSSE-NEXT: addl %eax, %edi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: shrl %eax +; X32-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %ebp, %eax +; X32-NOSSE-NEXT: subl %eax, %esi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: andl %ecx, %eax +; X32-NOSSE-NEXT: shrl $2, %esi +; X32-NOSSE-NEXT: andl %ecx, %esi +; X32-NOSSE-NEXT: addl %eax, %esi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: shrl $4, %eax +; X32-NOSSE-NEXT: addl %esi, %eax +; X32-NOSSE-NEXT: movl %edx, %esi +; X32-NOSSE-NEXT: shrl %esi +; X32-NOSSE-NEXT: andl %ebp, %esi +; X32-NOSSE-NEXT: subl %esi, %edx +; X32-NOSSE-NEXT: movl %edx, %esi +; X32-NOSSE-NEXT: andl %ecx, %esi +; X32-NOSSE-NEXT: shrl $2, %edx +; X32-NOSSE-NEXT: andl %ecx, %edx +; X32-NOSSE-NEXT: addl %esi, %edx +; X32-NOSSE-NEXT: movl %edx, %ecx +; X32-NOSSE-NEXT: shrl $4, %ecx +; X32-NOSSE-NEXT: addl %edx, %ecx +; X32-NOSSE-NEXT: andl %ebx, %eax +; X32-NOSSE-NEXT: andl %ebx, %ecx +; X32-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %ecx +; X32-NOSSE-NEXT: addl %eax, %ecx +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: addl %edi, %ecx +; X32-NOSSE-NEXT: xorl %edx, %edx +; X32-NOSSE-NEXT: movl %edx, 12(%eax) +; X32-NOSSE-NEXT: movl %edx, 8(%eax) +; X32-NOSSE-NEXT: movl %edx, 4(%eax) +; X32-NOSSE-NEXT: movl %ecx, (%eax) +; X32-NOSSE-NEXT: popl %esi +; X32-NOSSE-NEXT: popl %edi +; X32-NOSSE-NEXT: popl %ebx +; X32-NOSSE-NEXT: popl %ebp +; X32-NOSSE-NEXT: retl $4 +; +; X64-LABEL: cnt128_pgso: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 +; X64-NEXT: andq %r8, %rax +; X64-NEXT: subq %rax, %rsi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r9, %rcx +; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101 +; X64-NEXT: imulq %rdx, %rcx +; X64-NEXT: shrq $56, %rcx +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: shrq %rsi +; X64-NEXT: andq %r8, %rsi +; 
X64-NEXT: subq %rsi, %rdi +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: andq %r9, %rax +; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: shrq $56, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt128_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: pushl %esi +; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X32-POPCNT-NEXT: addl %ecx, %edx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X32-POPCNT-NEXT: addl %ecx, %esi +; X32-POPCNT-NEXT: addl %edx, %esi +; X32-POPCNT-NEXT: xorl %ecx, %ecx +; X32-POPCNT-NEXT: movl %ecx, 12(%eax) +; X32-POPCNT-NEXT: movl %ecx, 8(%eax) +; X32-POPCNT-NEXT: movl %ecx, 4(%eax) +; X32-POPCNT-NEXT: movl %esi, (%eax) +; X32-POPCNT-NEXT: popl %esi +; X32-POPCNT-NEXT: retl $4 +; +; X64-POPCNT-LABEL: cnt128_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntq %rsi, %rcx +; X64-POPCNT-NEXT: popcntq %rdi, %rax +; X64-POPCNT-NEXT: addq %rcx, %rax +; X64-POPCNT-NEXT: xorl %edx, %edx +; X64-POPCNT-NEXT: retq +; +; X32-SSE2-LABEL: cnt128_pgso: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $1, %xmm1 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X32-SSE2-NEXT: pand %xmm2, %xmm1 +; X32-SSE2-NEXT: psubb %xmm1, %xmm0 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: pand %xmm1, %xmm3 +; X32-SSE2-NEXT: psrlw $2, %xmm0 +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: paddb %xmm3, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: psrlw $4, %xmm3 +; X32-SSE2-NEXT: paddb %xmm0, %xmm3 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE2-NEXT: pand %xmm0, %xmm3 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: psadbw %xmm4, %xmm3 +; X32-SSE2-NEXT: movd %xmm3, %ecx +; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X32-SSE2-NEXT: psrlw $1, %xmm5 +; X32-SSE2-NEXT: pand %xmm2, %xmm5 +; X32-SSE2-NEXT: psubb %xmm5, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: pand %xmm1, %xmm2 +; X32-SSE2-NEXT: psrlw $2, %xmm3 +; X32-SSE2-NEXT: pand %xmm1, %xmm3 +; X32-SSE2-NEXT: paddb %xmm2, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE2-NEXT: psrlw $4, %xmm1 +; X32-SSE2-NEXT: paddb %xmm3, %xmm1 +; X32-SSE2-NEXT: pand %xmm0, %xmm1 +; X32-SSE2-NEXT: psadbw %xmm4, %xmm1 +; X32-SSE2-NEXT: movd %xmm1, %edx +; X32-SSE2-NEXT: addl %ecx, %edx +; X32-SSE2-NEXT: xorl %ecx, %ecx +; X32-SSE2-NEXT: movl %ecx, 12(%eax) +; X32-SSE2-NEXT: movl %ecx, 8(%eax) +; X32-SSE2-NEXT: movl %ecx, 4(%eax) +; X32-SSE2-NEXT: movl %edx, (%eax) +; X32-SSE2-NEXT: retl $4 +; +; X32-SSSE3-LABEL: cnt128_pgso: +; X32-SSSE3: # %bb.0: +; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: movdqa 
{{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X32-SSSE3-NEXT: psrlw $4, %xmm1 +; X32-SSSE3-NEXT: pand %xmm0, %xmm1 +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2 +; X32-SSSE3-NEXT: paddb %xmm4, %xmm2 +; X32-SSSE3-NEXT: pxor %xmm1, %xmm1 +; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2 +; X32-SSSE3-NEXT: movd %xmm2, %ecx +; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; X32-SSSE3-NEXT: pand %xmm0, %xmm4 +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X32-SSSE3-NEXT: psrlw $4, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3 +; X32-SSSE3-NEXT: paddb %xmm5, %xmm3 +; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3 +; X32-SSSE3-NEXT: movd %xmm3, %edx +; X32-SSSE3-NEXT: addl %ecx, %edx +; X32-SSSE3-NEXT: xorl %ecx, %ecx +; X32-SSSE3-NEXT: movl %ecx, 12(%eax) +; X32-SSSE3-NEXT: movl %ecx, 8(%eax) +; X32-SSSE3-NEXT: movl %ecx, 4(%eax) +; X32-SSSE3-NEXT: movl %edx, (%eax) +; X32-SSSE3-NEXT: retl $4 + %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) + ret i128 %cnt +} + declare i8 @llvm.ctpop.i8(i8) nounwind readnone declare i16 @llvm.ctpop.i16(i16) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone declare i128 @llvm.ctpop.i128(i128) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/pr27202.ll b/llvm/test/CodeGen/X86/pr27202.ll --- a/llvm/test/CodeGen/X86/pr27202.ll +++ b/llvm/test/CodeGen/X86/pr27202.ll @@ -14,6 +14,19 @@ ret i1 %cmp } +define i1 @foo_pgso(i32 %i) !prof !14 { +; CHECK-LABEL: foo_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $305419896, %eax # imm = 0x12345678 +; CHECK-NEXT: andl %eax, %edi +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + %and = and i32 %i, 305419896 + %cmp = icmp eq i32 %and, 305419896 + ret i1 %cmp +} + ; 8-bit ALU immediates probably have small encodings. ; We do not want to hoist the constant into a register here. 
@@ -52,3 +65,20 @@ %or4 = or i64 %or, %shl ret i64 %or4 } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -417,6 +417,33 @@ ret <8 x i64> %shuffle } +define <8 x i64> @pr23259_pgso() !prof !14 { +; AVX-LABEL: pr23259_pgso: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX-NEXT: retq +; +; AVX2-LABEL: pr23259_pgso: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: retq +entry: + %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32 + %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> + %shuffle = shufflevector <3 x i64> , <3 x i64> %1, <8 x i32> + ret <8 x i64> %shuffle +} + attributes #0 = { optsize } attributes #1 = { minsize } diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -121,6 +121,38 @@ br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } +define void @pr43371_pgso() !prof !14 { +; +; CHECK-LABEL: @pr43371_pgso +; CHECK-NOT: vector.scevcheck +; +; We do not want to generate SCEV predicates when optimising for size, because +; that will lead to extra code generation such as the SCEV overflow runtime +; checks. Not generating SCEV predicates can still result in vectorisation as +; the non-consecutive loads/stores can be scalarized: +; +; CHECK: vector.body: +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: br i1 {{.*}}, label %vector.body +; +entry: + br label %for.body29 + +for.cond.cleanup28: + unreachable + +for.body29: + %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] + %add33 = add i16 undef, %i24.0170 + %idxprom34 = zext i16 %add33 to i32 + %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 + store i16 0, i16 * %arrayidx35, align 1 + %inc37 = add i16 %i24.0170, 1 + %cmp26 = icmp ult i16 %inc37, 756 + br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 +} + ; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out. 
; define i32 @pr45526() optsize { @@ -154,6 +186,37 @@ ret i32 %for } +define i32 @pr45526_pgso() !prof !14 { +; +; CHECK-LABEL: @pr45526_pgso +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %loop +; CHECK-EMPTY: +; CHECK-NEXT: loop: +; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1 +; CHECK-NEXT: %cond = icmp ult i32 %piv, 510 +; CHECK-NEXT: br i1 %cond, label %loop, label %exit +; CHECK-EMPTY: +; CHECK-NEXT: exit: +; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ] +; CHECK-NEXT: ret i32 %for.lcssa +; +entry: + br label %loop + +loop: + %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] + %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] + %pivPlus1 = add nuw nsw i32 %piv, 1 + %cond = icmp ult i32 %piv, 510 + br i1 %cond, label %loop, label %exit + +exit: + ret i32 %for +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"ProfileSummary", !1} !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
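
Note: the IR-level changes above all gate size optimizations on the same
profile-guided size optimization (PGSO) query. A minimal sketch of that
pattern, assuming only the shouldOptimizeForSize() overload from
llvm/Transforms/Utils/SizeOpts.h that the diff itself calls; the wrapper name
optForSizeBasedOnProfile is illustrative and not part of the patch:

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/IR/Function.h"
  #include "llvm/Transforms/Utils/SizeOpts.h"
  using namespace llvm;

  // A block is treated as "optimize for size" either because the enclosing
  // function carries the optsize attribute (-Os/-Oz), or because the profile
  // (PSI + BFI) shows the block to be cold, so shrinking it cannot hurt the
  // hot path.
  static bool optForSizeBasedOnProfile(const Function &F, const BasicBlock *BB,
                                       ProfileSummaryInfo *PSI,
                                       BlockFrequencyInfo *BFI) {
    if (F.hasOptSize())
      return true; // The explicit attribute still wins unconditionally.
    // shouldOptimizeForSize() returns false when PSI or BFI is null or no
    // profile summary is attached, so non-PGO builds behave as before.
    return shouldOptimizeForSize(BB, PSI, BFI, PGSOQueryType::IRPass);
  }

The tests exercise this path by marking each _pgso function cold: the
!prof !14 metadata (function_entry_count 0) together with the module-level
ProfileSummary is what makes shouldOptimizeForSize() return true, which is why
the test files gain the !llvm.module.flags blocks repeated above.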