diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -53,6 +53,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -1088,6 +1089,47 @@
   // Save loop properties before it is transformed.
   MDNode *OrigLoopID = L->getLoopID();

+  // Check if we should rotate the loop when optimizing for size and the loop
+  // can be fully unrolled. We do some additional checks to make sure the loop
+  // will be unrollable after rotating, to avoid rotating without unrolling.
+  auto shouldRotate = [&](Loop *L) {
+    if (!L->getHeader()->getParent()->optForSize() || !TripCount ||
+        UP.Count != TripCount)
+      return false;
+
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader)
+      return false;
+
+    BasicBlock *LatchBlock = L->getLoopLatch();
+    if (!LatchBlock)
+      return false;
+
+    // Loops with indirectbr cannot be cloned.
+    if (!L->isSafeToClone())
+      return false;
+
+    BasicBlock *Header = L->getHeader();
+    if (Header->hasAddressTaken())
+      return false;
+
+    BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+    if (!BI || BI->isUnconditional()) {
+      BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
+      auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
+        return HeaderBI->getSuccessor(S1) == LatchBlock &&
+               !L->contains(HeaderBI->getSuccessor(S2));
+      };
+      return HeaderBI->isConditional() &&
+             (CheckSuccessors(0, 1) || CheckSuccessors(1, 0));
+    }
+    return false;
+  };
+
+  if (shouldRotate(L)) {
+    SimplifyQuery SQ(L->getHeader()->getParent()->getParent()->getDataLayout());
+    LoopRotation(L, LI, &TTI, &AC, &DT, &SE, nullptr, SQ, true, -1, true);
+  }
   // Unroll the loop.
   Loop *RemainderLoop = nullptr;
   LoopUnrollResult UnrollResult = UnrollLoop(
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
--- a/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
@@ -124,6 +124,99 @@
   ret void
 }

+; We need to rotate the loop in order to unroll it.
+define void @fully_unrolled_smaller_rotated() #0 {
+; CHECK-LABEL: @fully_unrolled_smaller_rotated(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x i32], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 0
+; CHECK-NEXT:    store i32 16, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 1
+; CHECK-NEXT:    store i32 4104, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 2
+; CHECK-NEXT:    store i32 1048592, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 3
+; CHECK-NEXT:    store i32 268435480, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = bitcast [4 x i32]* [[ARR]] to i32*
+; CHECK-NEXT:    call void @use(i32* nonnull [[PTR]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arr = alloca [4 x i32], align 4
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %exitcond = icmp eq i64 %indvars.iv, 3
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv.tr = trunc i64 %indvars.iv to i32
+  %shl.0 = shl i32 %indvars.iv.tr, 3
+  %shl.1 = shl i32 16, %shl.0
+  %or = or i32 %shl.1, %shl.0
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 %indvars.iv
+  store i32 %or, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %ptr = bitcast [4 x i32]* %arr to i32*
+  call void @use(i32* nonnull %ptr) #4
+  ret void
+}
+
+define void @fully_unrolled_bigger_not_rotated() #0 {
+; CHECK-LABEL: @fully_unrolled_bigger_not_rotated(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x i32], align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV_TR:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[SHL_0:%.*]] = shl i32 [[INDVARS_IV_TR]], 3
+; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 16, [[SHL_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL_1]], [[SHL_0]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[OR]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[PTR:%.*]] = bitcast [4 x i32]* [[ARR]] to i32*
+; CHECK-NEXT:    call void @use(i32* nonnull [[PTR]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arr = alloca [4 x i32], align 4
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %exitcond = icmp eq i64 %indvars.iv, 6
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv.tr = trunc i64 %indvars.iv to i32
+  %shl.0 = shl i32 %indvars.iv.tr, 3
+  %shl.1 = shl i32 16, %shl.0
+  %or = or i32 %shl.1, %shl.0
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 %indvars.iv
+  store i32 %or, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %ptr = bitcast [4 x i32]* %arr to i32*
+  call void @use(i32* nonnull %ptr) #4
+  ret void
+}
+
+
 declare void @use(i32*)

 attributes #0 = { minsize optsize }