diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -53,6 +53,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
@@ -1088,6 +1089,47 @@
   // Save loop properties before it is transformed.
   MDNode *OrigLoopID = L->getLoopID();

+  // Check if we should rotate the loop when optimizing for size and the loop
+  // can be fully unrolled. We do some additional checks to make sure the loop
+  // will be unrollable after rotating, to avoid rotating without unrolling.
+  auto shouldRotate = [&](Loop *L) {
+    if (!L->getHeader()->getParent()->optForSize() || !TripCount ||
+        UP.Count != TripCount)
+      return false;
+
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader)
+      return false;
+
+    BasicBlock *LatchBlock = L->getLoopLatch();
+    if (!LatchBlock)
+      return false;
+
+    // Loops with indirectbr cannot be cloned.
+    if (!L->isSafeToClone())
+      return false;
+
+    BasicBlock *Header = L->getHeader();
+    if (Header->hasAddressTaken())
+      return false;
+
+    BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+    if (!BI || BI->isUnconditional()) {
+      BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
+      auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
+        return HeaderBI->getSuccessor(S1) == LatchBlock &&
+               !L->contains(HeaderBI->getSuccessor(S2));
+      };
+      return HeaderBI->isConditional() &&
+             (CheckSuccessors(0, 1) || CheckSuccessors(1, 0));
+    }
+    return false;
+  };
+
+  if (shouldRotate(L)) {
+    SimplifyQuery SQ(L->getHeader()->getParent()->getParent()->getDataLayout());
+    LoopRotation(L, LI, &TTI, &AC, &DT, &SE, nullptr, SQ, true, -1, true);
+  }
   // Unroll the loop.
   Loop *RemainderLoop = nullptr;
   LoopUnrollResult UnrollResult = UnrollLoop(
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
--- a/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unroll-optsize.ll
@@ -124,6 +124,99 @@
   ret void
 }

+; We need to rotate the loop in order to unroll it.
+define void @fully_unrolled_smaller_rotated() #0 {
+; CHECK-LABEL: @fully_unrolled_smaller_rotated(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x i32], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 0
+; CHECK-NEXT:    store i32 16, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 1
+; CHECK-NEXT:    store i32 4104, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 2
+; CHECK-NEXT:    store i32 1048592, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 3
+; CHECK-NEXT:    store i32 268435480, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = bitcast [4 x i32]* [[ARR]] to i32*
+; CHECK-NEXT:    call void @use(i32* nonnull [[PTR]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arr = alloca [4 x i32], align 4
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %exitcond = icmp eq i64 %indvars.iv, 3
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv.tr = trunc i64 %indvars.iv to i32
+  %shl.0 = shl i32 %indvars.iv.tr, 3
+  %shl.1 = shl i32 16, %shl.0
+  %or = or i32 %shl.1, %shl.0
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 %indvars.iv
+  store i32 %or, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %ptr = bitcast [4 x i32]* %arr to i32*
+  call void @use(i32* nonnull %ptr) #4
+  ret void
+}
+
+define void @fully_unrolled_bigger_not_rotated() #0 {
+; CHECK-LABEL: @fully_unrolled_bigger_not_rotated(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARR:%.*]] = alloca [4 x i32], align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 6
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV_TR:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[SHL_0:%.*]] = shl i32 [[INDVARS_IV_TR]], 3
+; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 16, [[SHL_0]]
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL_1]], [[SHL_0]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[OR]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[PTR:%.*]] = bitcast [4 x i32]* [[ARR]] to i32*
+; CHECK-NEXT:    call void @use(i32* nonnull [[PTR]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arr = alloca [4 x i32], align 4
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %exitcond = icmp eq i64 %indvars.iv, 6
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv.tr = trunc i64 %indvars.iv to i32
+  %shl.0 = shl i32 %indvars.iv.tr, 3
+  %shl.1 = shl i32 16, %shl.0
+  %or = or i32 %shl.1, %shl.0
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 %indvars.iv
+  store i32 %or, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %ptr = bitcast [4 x i32]* %arr to i32*
+  call void @use(i32* nonnull %ptr) #4
+  ret void
+}
+
+
 declare void @use(i32*)

 attributes #0 = { minsize optsize }