Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1435,6 +1435,12 @@
     return PrefLoopAlignment;
   }
 
+  /// Should loops be aligned even when the function is marked OptSize (but not
+  /// MinSize).
+  virtual bool alignLoopsWithOptSize() const {
+    return false;
+  }
+
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
   /// DEPRECATED: please override useLoadStackGuardNode and customize
Index: llvm/lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2497,7 +2497,8 @@
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F->getFunction().optForSize())
+  if (F->getFunction().optForMinSize() ||
+      (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
     return;
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   if (FunctionChain.begin() == FunctionChain.end())
Index: llvm/lib/Target/ARM/ARM.td
===================================================================
--- llvm/lib/Target/ARM/ARM.td
+++ llvm/lib/Target/ARM/ARM.td
@@ -943,6 +943,7 @@
                                                        FeatureHasNoBranchPredictor]>;
 
 def : ProcessorModel<"cortex-m4", CortexM3Model,      [ARMv7em,
+                                                       ProcM3,
                                                        FeatureVFP4,
                                                        FeatureVFPOnlySP,
                                                        FeatureD16,
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -575,6 +575,8 @@
     bool isLegalInterleavedAccessType(VectorType *VecTy,
                                       const DataLayout &DL) const;
 
+    bool alignLoopsWithOptSize() const override;
+
     /// Returns the number of interleaved accesses that will be generated when
     /// lowering accesses of the given type.
     unsigned getNumInterleavedAccesses(VectorType *VecTy,
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -238,6 +238,8 @@
                                   : CallingConv::ARM_AAPCS);
   }
 
+  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+
   if (Subtarget->isTargetMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
@@ -14695,6 +14697,11 @@
                              Addr});
 }
 
+
+bool ARMTargetLowering::alignLoopsWithOptSize() const {
+  return Subtarget->isMClass();
+}
+
 /// A helper function for determining the number of interleaved accesses we
 /// will generate when lowering accesses of the given type.
 unsigned
Index: llvm/lib/Target/ARM/ARMSubtarget.h
===================================================================
--- llvm/lib/Target/ARM/ARMSubtarget.h
+++ llvm/lib/Target/ARM/ARMSubtarget.h
@@ -438,6 +438,9 @@
   /// operand cycle returned by the itinerary data for pre-ISel operands.
   int PreISelOperandLatencyAdjustment = 2;
 
+  /// What alignment is preferred for loop bodies, in log2(bytes).
+  unsigned PrefLoopAlignment = 0;
+
   /// IsLittle - The target is Little Endian
   bool IsLittle;
 
@@ -804,6 +807,10 @@
   bool allowPositionIndependentMovt() const {
     return isROPI() || !isTargetELF();
   }
+
+  unsigned getPrefLoopAlignment() const {
+    return PrefLoopAlignment;
+  }
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -285,11 +285,15 @@
   case CortexR4F:
   case CortexR5:
   case CortexR7:
-  case CortexM3:
   case CortexR52:
   case ExynosM1:
   case Kryo:
    break;
+  case CortexM3:
+    // According to the Technical Reference Manual, a branch to an unaligned
+    // 32-bit instruction incurs an extra pipeline reload penalty.
+    PrefLoopAlignment = 2;
+    break;
   case Krait:
     PreISelOperandLatencyAdjustment = 1;
     break;
Index: llvm/test/CodeGen/ARM/loop-align-cortex-m.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/loop-align-cortex-m.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
+
+define void @test_loop_alignment(i32* %in, i32* %out) optsize {
+; CHECK-LABEL: test_loop_alignment:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK: .p2align 2
+
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+  %lhs = load i32, i32* %in.addr, align 4
+  %res = mul nsw i32 %lhs, 5
+  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+  store i32 %res, i32* %out.addr, align 4
+  %i.next = add i32 %i, 1
+  %done = icmp eq i32 %i.next, 1024
+  br i1 %done, label %end, label %loop
+
+end:
+  ret void
+}
+
+define void @test_loop_alignment_minsize(i32* %in, i32* %out) minsize {
+; CHECK-LABEL: test_loop_alignment_minsize:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK-NOT: .p2align
+
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+  %lhs = load i32, i32* %in.addr, align 4
+  %res = mul nsw i32 %lhs, 5
+  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+  store i32 %res, i32* %out.addr, align 4
+  %i.next = add i32 %i, 1
+  %done = icmp eq i32 %i.next, 1024
+  br i1 %done, label %end, label %loop
+
+end:
+  ret void
+}