Index: lib/Target/ARM/ARMSubtarget.h
===================================================================
--- lib/Target/ARM/ARMSubtarget.h
+++ lib/Target/ARM/ARMSubtarget.h
@@ -682,6 +682,8 @@
 
   unsigned getMispredictionPenalty() const;
 
+  unsigned getIssueWidth() const;
+
   /// This function returns true if the target has sincos() routine in its
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -331,6 +331,10 @@
   return SchedModel.MispredictPenalty;
 }
 
+unsigned ARMSubtarget::getIssueWidth() const {
+  return getSchedModel().IssueWidth;
+}
+
 bool ARMSubtarget::hasSinCos() const {
   return isTargetWatchOS() ||
     (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -122,6 +122,8 @@
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace);
 
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to
Index: lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -544,3 +544,78 @@
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
 }
+
+static Value *getTripCountValue(Loop *L) {
+  // Canonical loops will end with a 'cmp ne I, V', where I is the incremented
+  // canonical induction variable and V is the trip count of the loop.
+  PHINode *IV = L->getCanonicalInductionVariable();
+  if (!IV)
+    return nullptr;
+
+  BasicBlock *BackedgeBlock = L->getLoopLatch();
+  Value *Inc = IV->getIncomingValueForBlock(BackedgeBlock);
+
+  if (auto *BI = dyn_cast<BranchInst>(BackedgeBlock->getTerminator())) {
+    if (BI->isConditional()) {
+      if (auto *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+        if (ICI->getOperand(0) == Inc) {
+          return ICI->getOperand(1);
+        }
+      }
+    }
+  }
+  return nullptr;
+}
+
+void ARMTTIImpl::getUnrollingPreferences(Loop *L,
+                                         TTI::UnrollingPreferences &UP) {
+  if (!ST->isThumb2() || !ST->isMClass() || (L->getNumBlocks() != 1) ||
+      !L->getExitingBlock() || !L->getUniqueExitBlock())
+    return;
+
+  BasicBlock *BB = L->getLoopLatch();
+  UP.OptSizeThreshold = 0;
+  UP.PartialOptSizeThreshold = 0;
+
+  // Scan the loop: don't unroll loops with calls.
+  for (auto &I : *BB) {
+    if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+      ImmutableCallSite CS(&I);
+      if (const Function *F = CS.getCalledFunction()) {
+        if (!isLoweredToCall(F))
+          continue;
+      }
+      return;
+    }
+  }
+
+  // Enable partial unrolling and set the initial threshold based upon
+  // the issue width of the microarchitecture.
+  UP.Partial = true;
+  UP.Threshold = ST->getIssueWidth() > 1 ? 300 : 150;
+  UP.PartialThreshold = ST->getIssueWidth() > 1 ? 300 : 150;
+
+  // Enable runtime unrolling for non-nested loops.
+  if (L->getLoopDepth() == 1) {
+    UP.Runtime = true;
+    return;
+  }
+
+  // For nested loops, we want the trip count to be invariant in the outer
+  // loops.
+  Value *TripCount = getTripCountValue(L);
+  if (!TripCount)
+    return;
+
+  Loop *Parent = L->getParentLoop();
+  while (Parent) {
+    if (!Parent->isLoopInvariant(TripCount))
+      return;
+    Parent = Parent->getParentLoop();
+  }
+
+  // Lower the threshold for nested runtime loops.
+  if (!isa<Constant>(TripCount))
+    UP.PartialThreshold /= 2;
+  UP.Runtime = true;
+}
Index: test/CodeGen/ARM/loop-unrolling.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/loop-unrolling.ll
@@ -0,0 +1,200 @@
+; RUN: opt -mcpu=cortex-m4 -loop-unroll -S %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-"
+
+; CHECK-LABEL: partial
+define arm_aapcs_vfpcc void @partial(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+; CHECK: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV16:%[a-z.0-9]+]], %for.body ]
+; CHECK: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
+; CHECK: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
+; CHECK: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
+; CHECK: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
+; CHECK: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
+; CHECK: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
+; CHECK: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
+; CHECK: [[IV8:%[a-z.0-9]+]] = add nuw nsw i32 [[IV7]], 1
+; CHECK: [[IV9:%[a-z.0-9]+]] = add nuw nsw i32 [[IV8]], 1
+; CHECK: [[IV10:%[a-z.0-9]+]] = add nuw nsw i32 [[IV9]], 1
+; CHECK: [[IV11:%[a-z.0-9]+]] = add nuw nsw i32 [[IV10]], 1
+; CHECK: [[IV12:%[a-z.0-9]+]] = add nuw nsw i32 [[IV11]], 1
+; CHECK: [[IV13:%[a-z.0-9]+]] = add nuw nsw i32 [[IV12]], 1
+; CHECK: [[IV14:%[a-z.0-9]+]] = add nuw nsw i32 [[IV13]], 1
+; CHECK: [[IV15:%[a-z.0-9]+]] = add nuw nsw i32 [[IV14]], 1
+; CHECK: [[IV16:%[a-z.0-9]+]] = add nuw nsw i32 [[IV15]], 1
+; CHECK: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV16]], 1024
+; CHECK: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; CHECK-LABEL: runtime
+define arm_aapcs_vfpcc void @runtime(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+; CHECK: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV8:%[a-z.0-9]+]], %for.body ]
+; CHECK: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
+; CHECK: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
+; CHECK: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
+; CHECK: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
+; CHECK: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
+; CHECK: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
+; CHECK: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
+; CHECK: [[IV8]] = add nuw i32 [[IV7]], 1
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; CHECK-LABEL: nested_runtime
+define arm_aapcs_vfpcc void @nested_runtime(i32* nocapture %C, i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp25 = icmp eq i32 %N, 0
+  br i1 %cmp25, label %for.cond.cleanup, label %for.body4.lr.ph
+
+for.body4.lr.ph:                                  ; preds = %entry, %for.cond.cleanup3
+  %h.026 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %entry ]
+  %mul = mul i32 %h.026, %N
+  br label %for.body4
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.cond.cleanup3:                                ; preds = %for.body4
+  %inc11 = add nuw i32 %h.026, 1
+  %exitcond27 = icmp eq i32 %inc11, %N
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
+
+; CHECK-LABEL: for.body4
+; CHECK: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body4 ]
+; CHECK: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
+; CHECK: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
+; CHECK: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
+; CHECK: [[IV4:%[a-z.0-9]+]] = add nuw i32 [[IV3]], 1
+for.body4:                                        ; preds = %for.body4, %for.body4.lr.ph
+  %w.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add i32 %w.024, %mul
+  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+  %0 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %arrayidx5 = getelementptr inbounds i16, i16* %B, i32 %w.024
+  %1 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sext i16 %1 to i32
+  %mul7 = mul nsw i32 %conv6, %conv
+  %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %w.024
+  %2 = load i32, i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %mul7, %2
+  store i32 %add9, i32* %arrayidx8, align 4
+  %inc = add nuw i32 %w.024, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+}
+
+; CHECK-LABEL: loop_call
+define arm_aapcs_vfpcc void @loop_call(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #1 {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+; CHECK-LABEL: for.body
+; CHECK: %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+; CHECK: %inc = add nuw nsw i32 %i.08, 1
+; CHECK: %exitcond = icmp eq i32 %inc, 1024
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
+  %1 = load i32, i32* %arrayidx1, align 4
+  %call = tail call arm_aapcs_vfpcc i32 @some_func(i32 %0, i32 %1) #3
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
+  store i32 %call, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: nested_runtime_variant
+define arm_aapcs_vfpcc void @nested_runtime_variant(i32* nocapture %C, i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp27 = icmp eq i32 %N, 0
+  br i1 %cmp27, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.cond.cleanup3
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.cond.cleanup3 ], [ %N, %entry ]
+  %h.028 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %entry ]
+  %sub = sub i32 %N, %h.028
+  %cmp225 = icmp eq i32 %sub, 0
+  br i1 %cmp225, label %for.cond.cleanup3, label %for.body4.lr.ph
+
+for.body4.lr.ph:                                  ; preds = %for.body
+  %mul = mul i32 %sub, %h.028
+  br label %for.body4
+
+for.cond.cleanup3:                                ; preds = %for.body4, %for.body
+  %inc11 = add nuw i32 %h.028, 1
+  %indvars.iv.next = add i32 %indvars.iv, -1
+  %exitcond29 = icmp eq i32 %inc11, %N
+  br i1 %exitcond29, label %for.cond.cleanup, label %for.body
+
+; CHECK-LABEL: for.body4
+; CHECK: %w.026 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+; CHECK: %inc = add nuw i32 %w.026, 1
+; CHECK: %exitcond = icmp eq i32 %inc, %indvars.iv
+for.body4:                                        ; preds = %for.body4, %for.body4.lr.ph
+  %w.026 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
+  %add = add i32 %w.026, %mul
+  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
+  %0 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %arrayidx5 = getelementptr inbounds i16, i16* %B, i32 %w.026
+  %1 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sext i16 %1 to i32
+  %mul7 = mul nsw i32 %conv6, %conv
+  %arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %w.026
+  %2 = load i32, i32* %arrayidx8, align 4
+  %add9 = add nsw i32 %mul7, %2
+  store i32 %add9, i32* %arrayidx8, align 4
+  %inc = add nuw i32 %w.026, 1
+  %exitcond = icmp eq i32 %inc, %indvars.iv
+  br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
+}
+
+declare arm_aapcs_vfpcc i32 @some_func(i32, i32) local_unnamed_addr #2