Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -95,11 +95,7 @@
 
   bool enableInterleavedAccessVectorization() { return true; }
 
-  bool shouldFavorBackedgeIndex(const Loop *L) const {
-    if (L->getHeader()->getParent()->hasOptSize())
-      return false;
-    return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
-  }
+  bool shouldFavorBackedgeIndex(const Loop *L) const;
 
   /// Floating-point computation using ARMv8 AArch32 Advanced
   /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -64,6 +64,31 @@
   return MatchExact && MatchSubset;
 }
 
+// Check if the loop contains any vector code, in which case we modify unroll
+// and favor backedge behaviour.
+static bool containsVectors(const Loop *L) {
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (I.getType()->isVectorTy())
+        return true;
+      for (auto &Op : I.operands()) {
+        if (Op->getType()->isVectorTy())
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
+  if (L->getHeader()->getParent()->hasOptSize())
+    return false;
+  if (ST->hasMVEIntegerOps() && containsVectors(L))
+    return false;
+  return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
+}
+
 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
@@ -1228,6 +1253,11 @@
   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
     return;
 
+  // Don't unroll vectorised loop. MVE does not benefit from it as much as
+  // scalar code.
+  if (ST->hasMVEIntegerOps() && containsVectors(L))
+    return;
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining.
   unsigned Cost = 0;
@@ -1241,10 +1271,6 @@
         }
         return;
       }
-      // Don't unroll vectorised loop. MVE does not benefit from it as much as
-      // scalar code.
-      if (I.getType()->isVectorTy())
-        return;
 
       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                             I.value_op_end());
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -104,17 +104,17 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    sub.w r7, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #16
-; CHECK-NEXT:    sub.w r5, r1, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r5, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
-; CHECK-NEXT:    vstrb.8 q0, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB0_12
 ; CHECK-NEXT:  @ %bb.13: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -337,17 +337,17 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    sub.w r7, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #16
-; CHECK-NEXT:    sub.w r5, r1, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB1_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r5, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
 ; CHECK-NEXT:    vadd.f32 q0, q1, q0
-; CHECK-NEXT:    vstrb.8 q0, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB1_12
 ; CHECK-NEXT:  @ %bb.13: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -570,17 +570,17 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    sub.w r7, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #16
-; CHECK-NEXT:    sub.w r5, r1, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB2_12: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r5, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
 ; CHECK-NEXT:    vsub.f32 q0, q1, q0
-; CHECK-NEXT:    vstrb.8 q0, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB2_12
 ; CHECK-NEXT:  @ %bb.13: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -720,18 +720,18 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    sub.w r7, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #16
-; CHECK-NEXT:    sub.w r5, r1, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB3_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r5, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
 ; CHECK-NEXT:    vcvt.f32.s32 q0, q0
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
-; CHECK-NEXT:    vstrb.8 q0, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB3_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -942,18 +942,18 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #16
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
-; CHECK-NEXT:    sub.w r5, r1, #16
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r4, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r5, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r4], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vcvt.f32.s32 q0, q0
-; CHECK-NEXT:    vstrb.8 q0, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q0, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB4_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1037,7 +1037,7 @@
 define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
 ; CHECK-LABEL: half_half_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -1050,21 +1050,23 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #8
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
-; CHECK-NEXT:    sub.w r5, r1, #8
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r8, [r5, #8]!
-; CHECK-NEXT:    ldr r7, [r4, #8]!
-; CHECK-NEXT:    vmov.32 q1[0], r8
-; CHECK-NEXT:    vmov.32 q0[0], r7
-; CHECK-NEXT:    ldr r7, [r5, #4]
+; CHECK-NEXT:    ldr.w r9, [r4]
+; CHECK-NEXT:    ldr r7, [r5]
 ; CHECK-NEXT:    ldr.w r8, [r4, #4]
-; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vmov.32 q0[0], r9
+; CHECK-NEXT:    ldr.w r10, [r5, #4]
+; CHECK-NEXT:    vmov.32 q1[0], r7
 ; CHECK-NEXT:    vmov.32 q0[1], r8
+; CHECK-NEXT:    adds r4, #8
+; CHECK-NEXT:    vmov.32 q1[1], r10
+; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vmul.f16 q0, q0, q1
 ; CHECK-NEXT:    vmovx.f16 s6, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s0
@@ -1072,7 +1074,7 @@
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s9, s4
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vstrb.8 q2, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q2, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB5_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1096,7 +1098,7 @@
 ; CHECK-NEXT:    adds r2, #4
 ; CHECK-NEXT:    le lr, .LBB5_7
 ; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
@@ -1155,7 +1157,7 @@
 define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
 ; CHECK-LABEL: half_half_add:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -1168,21 +1170,23 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #8
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
-; CHECK-NEXT:    sub.w r5, r1, #8
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB6_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r8, [r5, #8]!
-; CHECK-NEXT:    ldr r7, [r4, #8]!
-; CHECK-NEXT:    vmov.32 q1[0], r8
-; CHECK-NEXT:    vmov.32 q0[0], r7
-; CHECK-NEXT:    ldr r7, [r5, #4]
+; CHECK-NEXT:    ldr.w r9, [r4]
+; CHECK-NEXT:    ldr r7, [r5]
 ; CHECK-NEXT:    ldr.w r8, [r4, #4]
-; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vmov.32 q0[0], r9
+; CHECK-NEXT:    ldr.w r10, [r5, #4]
+; CHECK-NEXT:    vmov.32 q1[0], r7
 ; CHECK-NEXT:    vmov.32 q0[1], r8
+; CHECK-NEXT:    adds r4, #8
+; CHECK-NEXT:    vmov.32 q1[1], r10
+; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmovx.f16 s6, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s0
@@ -1190,7 +1194,7 @@
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s9, s4
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vstrb.8 q2, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q2, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB6_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1214,7 +1218,7 @@
 ; CHECK-NEXT:    adds r2, #4
 ; CHECK-NEXT:    le lr, .LBB6_7
 ; CHECK-NEXT:  .LBB6_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
@@ -1273,7 +1277,7 @@
 define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) {
 ; CHECK-LABEL: half_half_sub:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB7_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -1286,21 +1290,23 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #8
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
-; CHECK-NEXT:    sub.w r5, r1, #8
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB7_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r8, [r5, #8]!
-; CHECK-NEXT:    ldr r7, [r4, #8]!
-; CHECK-NEXT:    vmov.32 q1[0], r8
-; CHECK-NEXT:    vmov.32 q0[0], r7
-; CHECK-NEXT:    ldr r7, [r5, #4]
+; CHECK-NEXT:    ldr.w r9, [r4]
+; CHECK-NEXT:    ldr r7, [r5]
 ; CHECK-NEXT:    ldr.w r8, [r4, #4]
-; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vmov.32 q0[0], r9
+; CHECK-NEXT:    ldr.w r10, [r5, #4]
+; CHECK-NEXT:    vmov.32 q1[0], r7
 ; CHECK-NEXT:    vmov.32 q0[1], r8
+; CHECK-NEXT:    adds r4, #8
+; CHECK-NEXT:    vmov.32 q1[1], r10
+; CHECK-NEXT:    adds r5, #8
 ; CHECK-NEXT:    vsub.f16 q0, q0, q1
 ; CHECK-NEXT:    vmovx.f16 s6, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s0
@@ -1308,7 +1314,7 @@
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s9, s4
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vstrb.8 q2, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q2, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB7_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1332,7 +1338,7 @@
 ; CHECK-NEXT:    adds r2, #4
 ; CHECK-NEXT:    le lr, .LBB7_7
 ; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 entry:
   %cmp8 = icmp eq i32 %N, 0
   br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
@@ -1391,7 +1397,7 @@
 define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) {
 ; CHECK-LABEL: half_short_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB8_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
@@ -1404,15 +1410,17 @@
 ; CHECK-NEXT:    bic r12, r3, #3
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    sub.w r6, r12, #4
-; CHECK-NEXT:    sub.w r4, r0, #8
+; CHECK-NEXT:    mov r4, r0
 ; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
-; CHECK-NEXT:    sub.w r5, r1, #8
-; CHECK-NEXT:    sub.w r6, r2, #16
+; CHECK-NEXT:    mov r5, r1
+; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB8_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r8, [r4, #8]!
-; CHECK-NEXT:    vldrh.u32 q0, [r5, #8]!
+; CHECK-NEXT:    ldr.w r9, [r4]
+; CHECK-NEXT:    ldr.w r8, [r4, #4]
+; CHECK-NEXT:    vldrh.u32 q0, [r5], #8
+; CHECK-NEXT:    adds r4, #8
 ; CHECK-NEXT:    vmov r7, s0
 ; CHECK-NEXT:    vmov.16 q1[0], r7
 ; CHECK-NEXT:    vmov r7, s1
@@ -1421,10 +1429,9 @@
 ; CHECK-NEXT:    vmov.16 q1[2], r7
 ; CHECK-NEXT:    vmov r7, s3
 ; CHECK-NEXT:    vmov.16 q1[3], r7
-; CHECK-NEXT:    ldr r7, [r4, #4]
 ; CHECK-NEXT:    vcvt.f16.s16 q0, q1
-; CHECK-NEXT:    vmov.32 q1[0], r8
-; CHECK-NEXT:    vmov.32 q1[1], r7
+; CHECK-NEXT:    vmov.32 q1[0], r9
+; CHECK-NEXT:    vmov.32 q1[1], r8
 ; CHECK-NEXT:    vmul.f16 q0, q1, q0
 ; CHECK-NEXT:    vmovx.f16 s6, s1
 ; CHECK-NEXT:    vmovx.f16 s4, s0
@@ -1432,7 +1439,7 @@
 ; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
 ; CHECK-NEXT:    vcvtb.f32.f16 s9, s4
 ; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
-; CHECK-NEXT:    vstrb.8 q2, [r6, #16]!
+; CHECK-NEXT:    vstrb.8 q2, [r6], #16
 ; CHECK-NEXT:    le lr, .LBB8_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
 ; CHECK-NEXT:    cmp r12, r3
@@ -1457,7 +1464,7 @@
 ; CHECK-NEXT:    adds r2, #4
 ; CHECK-NEXT:    le lr, .LBB8_7
 ; CHECK-NEXT:  .LBB8_8: @ %for.cond.cleanup
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:
   %cmp10 = icmp eq i32 %N, 0
   br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
Index: llvm/test/CodeGen/Thumb2/mve-vmla.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vmla.ll
+++ llvm/test/CodeGen/Thumb2/mve-vmla.ll
@@ -82,15 +82,13 @@
 define void @vmla32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) {
 ; CHECK-LABEL: vmla32_in_loop:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    subs r0, #16
-; CHECK-NEXT:    subs r2, #16
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]!
-; CHECK-NEXT:    vldrw.u32 q1, [r2, #16]!
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
 ; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmla.u32 q1, q0, r1
-; CHECK-NEXT:    vstrw.32 q1, [r2]
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB6_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr
@@ -122,15 +120,13 @@
 define void @vmla16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) {
 ; CHECK-LABEL: vmla16_in_loop:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    subs r0, #16
-; CHECK-NEXT:    subs r2, #16
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
-; CHECK-NEXT:    vldrh.u16 q1, [r2, #16]!
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
 ; CHECK-NEXT:    subs r3, #8
 ; CHECK-NEXT:    vmla.u16 q1, q0, r1
-; CHECK-NEXT:    vstrh.16 q1, [r2]
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr
@@ -162,15 +158,13 @@
 define void @vmla8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) {
 ; CHECK-LABEL: vmla8_in_loop:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    subs r0, #16
-; CHECK-NEXT:    subs r2, #16
 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
-; CHECK-NEXT:    vldrh.u16 q1, [r2, #16]!
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
 ; CHECK-NEXT:    subs r3, #16
 ; CHECK-NEXT:    vmla.u8 q1, q0, r1
-; CHECK-NEXT:    vstrh.16 q1, [r2]
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
 ; CHECK-NEXT:    bne .LBB8_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT:    bx lr