diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -43,6 +43,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -60,6 +61,7 @@
 class MachinePipeliner : public MachineFunctionPass {
 public:
   MachineFunction *MF = nullptr;
+  MachineOptimizationRemarkEmitter *ORE = nullptr;
   const MachineLoopInfo *MLI = nullptr;
   const MachineDominatorTree *MDT = nullptr;
   const InstrItineraryData *InstrItins;
@@ -96,6 +98,7 @@
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachineDominatorTree>();
     AU.addRequired<LiveIntervals>();
+    AU.addRequired<MachineOptimizationRemarkEmitterPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -217,6 +217,7 @@
   MF = &mf;
   MLI = &getAnalysis<MachineLoopInfo>();
   MDT = &getAnalysis<MachineDominatorTree>();
+  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
   TII = MF->getSubtarget().getInstrInfo();
   RegClassInfo.runOnMachineFunction(*MF);
 
@@ -248,6 +249,12 @@
   setPragmaPipelineOptions(L);
   if (!canPipelineLoop(L)) {
     LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
+                                             L.getStartLoc(), L.getHeader())
+             << "Failed to pipeline loop";
+    });
+
     return Changed;
   }
 
@@ -309,11 +316,24 @@
 /// restricted to loops with a single basic block.  Make sure that the
 /// branch in the loop can be analyzed.
 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
-  if (L.getNumBlocks() != 1)
+  if (L.getNumBlocks() != 1) {
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+                                               L.getStartLoc(), L.getHeader())
+             << "Not a single basic block: "
+             << ore::NV("NumBlocks", L.getNumBlocks());
+    });
     return false;
+  }
 
-  if (disabledByPragma)
+  if (disabledByPragma) {
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+                                               L.getStartLoc(), L.getHeader())
+             << "Disabled by Pragma.";
+    });
     return false;
+  }
 
   // Check if the branch can't be understood because we can't do pipelining
   // if that's the case.
@@ -321,25 +341,37 @@
   LI.FBB = nullptr;
   LI.BrCond.clear();
   if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
-    LLVM_DEBUG(
-        dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");
+    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
     NumFailBranch++;
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+                                               L.getStartLoc(), L.getHeader())
+             << "The branch can't be understood";
+    });
     return false;
   }
 
   LI.LoopInductionVar = nullptr;
   LI.LoopCompare = nullptr;
   if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
-    LLVM_DEBUG(
-        dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
+    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
     NumFailLoop++;
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+                                               L.getStartLoc(), L.getHeader())
+             << "The loop structure is not supported";
+    });
     return false;
   }
 
   if (!L.getLoopPreheader()) {
-    LLVM_DEBUG(
-        dbgs() << "Preheader not found, can NOT pipeline current Loop\n");
+    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
     NumFailPreheader++;
+    ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+                                               L.getStartLoc(), L.getHeader())
+             << "No loop preheader found";
+    });
     return false;
   }
 
@@ -457,10 +489,13 @@
 
   // Can't schedule a loop without a valid MII.
   if (MII == 0) {
-    LLVM_DEBUG(
-        dbgs()
-        << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");
+    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
     NumFailZeroMII++;
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "Invalid Minimal Initiation Interval: 0";
+    });
     return;
   }
 
@@ -469,6 +504,14 @@
     LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                       << ", we don't pipleline large loops\n");
     NumFailLargeMaxMII++;
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "Minimal Initiation Interval too large: "
+             << ore::NV("MII", (int)MII) << " > "
+             << ore::NV("SwpMaxMii", SwpMaxMii) << "."
+             << "Refer to -pipeliner-max-mii.";
+    });
     return;
   }
 
@@ -511,15 +554,24 @@
   if (!Scheduled){
     LLVM_DEBUG(dbgs() << "No schedule found, return\n");
     NumFailNoSchedule++;
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "Unable to find schedule";
+    });
     return;
   }
 
   unsigned numStages = Schedule.getMaxStageCount();
   // No need to generate pipeline if there are no overlapped iterations.
   if (numStages == 0) {
-    LLVM_DEBUG(
-        dbgs() << "No overlapped iterations, no need to generate pipeline\n");
+    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
     NumFailZeroStage++;
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "No need to pipeline - no overlapped iterations in schedule.";
+    });
     return;
   }
   // Check that the maximum stage count is less than user-defined limit.
@@ -527,9 +579,23 @@
     LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                       << " : too many stages, abort\n");
     NumFailLargeMaxStage++;
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "Too many stages in schedule: "
+             << ore::NV("numStages", (int)numStages) << " > "
+             << ore::NV("SwpMaxStages", SwpMaxStages)
+             << ". Refer to -pipeliner-max-stages.";
+    });
     return;
   }
 
+  Pass.ORE->emit([&]() {
+    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
+                                     Loop.getHeader())
+           << "Pipelined succesfully!";
+  });
+
   // Generate the schedule as a ModuloSchedule.
   DenseMap<MachineInstr *, int> Cycles, Stages;
   std::vector<MachineInstr *> OrderedInsts;
@@ -1080,7 +1146,7 @@
     }
   }
   int Resmii = Resources.size();
-  LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");
+  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
   // Delete the memory for each of the DFAs that were created earlier.
   for (ResourceManager *RI : Resources) {
     ResourceManager *D = RI;
@@ -2052,9 +2118,16 @@
   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
                     << ")\n");
 
-  if (scheduleFound)
+  if (scheduleFound) {
     Schedule.finalizeSchedule(this);
-  else
+    Pass.ORE->emit([&]() {
+      return MachineOptimizationRemarkAnalysis(
+                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+             << "Schedule found with Initiation Interval: " << ore::NV("II", II)
+             << ", MaxStageCount: "
+             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
+    });
+  } else
     Schedule.reset();
 
   return scheduleFound && Schedule.getMaxStageCount() > 0;
diff --git a/llvm/test/CodeGen/PowerPC/sms-remark.ll b/llvm/test/CodeGen/PowerPC/sms-remark.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/sms-remark.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:       -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \
+; RUN:       -pass-remarks-analysis=pipeliner -pass-remarks=pipeliner -o /dev/null 2>&1 \
+; RUN:       | FileCheck %s
+
+@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4
+@y = dso_local global [1024 x i32] zeroinitializer, align 4
+
+define dso_local i32* @foo() local_unnamed_addr {
+;CHECK: Schedule found with Initiation Interval
+;CHECK: Pipelined succesfully!
+entry:
+  %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0)
+
+for.body:                                         ; preds = %for.body, %entry
+  %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ]
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ]
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %1, %1
+  %add = add nsw i32 %mul, %0
+  %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx2.1, align 4
+  %mul.1 = mul nsw i32 %2, %2
+  %add.1 = add nsw i32 %mul.1, %add
+  %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next
+  store i32 %add.1, i32* %arrayidx6.1, align 4
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1
+  %3 = load i32, i32* %arrayidx2.2, align 4
+  %mul.2 = mul nsw i32 %3, %3
+  %add.2 = add nsw i32 %mul.2, %add.1
+  %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1
+  store i32 %add.2, i32* %arrayidx6.2, align 4
+  %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3
+  %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024
+  br i1 %exitcond.2, label %for.cond.cleanup, label %for.body
+}