Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1435,6 +1435,12 @@
     return PrefLoopAlignment;
   }
 
+  /// Should loops be aligned even when the function is marked OptSize (but not
+  /// MinSize).
+  virtual bool alignLoopsWithOptSize() const {
+    return false;
+  }
+
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
   /// DEPRECATED: please override useLoadStackGuardNode and customize
Index: llvm/lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2497,7 +2497,8 @@
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F->getFunction().optForSize())
+  if (F->getFunction().optForMinSize() ||
+      (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
     return;
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   if (FunctionChain.begin() == FunctionChain.end())
Index: llvm/lib/Target/ARM/ARM.td
===================================================================
--- llvm/lib/Target/ARM/ARM.td
+++ llvm/lib/Target/ARM/ARM.td
@@ -943,6 +943,7 @@
                                                        FeatureHasNoBranchPredictor]>;
 
 def : ProcessorModel<"cortex-m4", CortexM3Model,      [ARMv7em,
+                                                       ProcM3,
                                                        FeatureVFP4,
                                                        FeatureVFPOnlySP,
                                                        FeatureD16,
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -575,6 +575,8 @@
     bool isLegalInterleavedAccessType(VectorType *VecTy,
                                       const DataLayout &DL) const;
 
+    bool alignLoopsWithOptSize() const override;
+
     /// Returns the number of interleaved accesses that will be generated when
     /// lowering accesses of the given type.
     unsigned getNumInterleavedAccesses(VectorType *VecTy,
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -238,6 +238,8 @@
                                   : CallingConv::ARM_AAPCS);
   }
 
+  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+
   if (Subtarget->isTargetMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
@@ -14695,6 +14697,11 @@
                              Addr});
 }
 
+
+bool ARMTargetLowering::alignLoopsWithOptSize() const {
+  return Subtarget->isMClass();
+}
+
 /// A helper function for determining the number of interleaved accesses we
 /// will generate when lowering accesses of the given type.
 unsigned
Index: llvm/lib/Target/ARM/ARMSubtarget.h
===================================================================
--- llvm/lib/Target/ARM/ARMSubtarget.h
+++ llvm/lib/Target/ARM/ARMSubtarget.h
@@ -438,6 +438,9 @@
   /// operand cycle returned by the itinerary data for pre-ISel operands.
   int PreISelOperandLatencyAdjustment = 2;
 
+  /// What alignment is preferred for loop bodies, in log2(bytes).
+  unsigned PrefLoopAlignment = 0;
+
   /// IsLittle - The target is Little Endian
   bool IsLittle;
 
@@ -804,6 +807,10 @@
   bool allowPositionIndependentMovt() const {
     return isROPI() || !isTargetELF();
   }
+
+  unsigned getPrefLoopAlignment() const {
+    return PrefLoopAlignment;
+  }
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -285,11 +285,15 @@
   case CortexR4F:
   case CortexR5:
   case CortexR7:
-  case CortexM3:
   case CortexR52:
   case ExynosM1:
   case Kryo:
    break;
+  case CortexM3:
+    // According to the Technical Reference Manual, a branch to an unaligned
+    // 32-bit instruction incurs an extra pipeline reload penalty.
+    PrefLoopAlignment = 2;
+    break;
   case Krait:
     PreISelOperandLatencyAdjustment = 1;
     break;
Index: llvm/test/CodeGen/ARM/loop-align-cortex-m.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/loop-align-cortex-m.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
+
+define void @test_loop_alignment(i32* %in, i32* %out) optsize {
+; CHECK-LABEL: test_loop_alignment:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK: .p2align 2
+
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+  %lhs = load i32, i32* %in.addr, align 4
+  %res = mul nsw i32 %lhs, 5
+  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+  store i32 %res, i32* %out.addr, align 4
+  %i.next = add i32 %i, 1
+  %done = icmp eq i32 %i.next, 1024
+  br i1 %done, label %end, label %loop
+
+end:
+  ret void
+}
+
+define void @test_loop_alignment_minsize(i32* %in, i32* %out) minsize {
+; CHECK-LABEL: test_loop_alignment_minsize:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK-NOT: .p2align
+
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+  %lhs = load i32, i32* %in.addr, align 4
+  %res = mul nsw i32 %lhs, 5
+  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+  store i32 %res, i32* %out.addr, align 4
+  %i.next = add i32 %i, 1
+  %done = icmp eq i32 %i.next, 1024
+  br i1 %done, label %end, label %loop
+
+end:
+  ret void
+}