Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -93,11 +93,8 @@ bool enableInterleavedAccessVectorization() { return true; } - bool shouldFavorBackedgeIndex(const Loop *L) const { - if (L->getHeader()->getParent()->hasOptSize()) - return false; - return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; - } + bool shouldFavorBackedgeIndex(const Loop *L) const; + bool shouldFavorPostInc() const; /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -67,6 +67,20 @@ return MatchExact && MatchSubset; } +bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const { + if (L->getHeader()->getParent()->hasOptSize()) + return false; + if (ST->hasMVEIntegerOps()) + return false; + return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; +} + +bool ARMTTIImpl::shouldFavorPostInc() const { + if (ST->hasMVEIntegerOps()) + return true; + return false; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -78,20 +78,18 @@ ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r7, .LBB0_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: mvn r3, #3 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 ; CHECK-NEXT: mov lr, r7 -; CHECK-NEXT: add.w r3, r3, r12, lsl #2 -; CHECK-NEXT: add r1, r3 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: add r0, r3 ; CHECK-NEXT: .LBB0_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r1, #4] +; CHECK-NEXT: vldr s0, [r1] ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vldr s2, [r2, #4] +; CHECK-NEXT: vldr s2, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r0, #4] +; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: le lr, .LBB0_10 ; CHECK-NEXT: .LBB0_11: @ %for.cond.cleanup @@ -325,7 +323,7 @@ ; CHECK-NEXT: vdup.32 q2, r12 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_2: @ %cond.load24 +; CHECK-NEXT: .LBB2_2: @ %cond.load25 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s28 @@ -337,7 +335,7 @@ ; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: .LBB2_3: @ %else25 +; CHECK-NEXT: .LBB2_3: @ %else26 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmul.f16 q5, q6, q5 ; CHECK-NEXT: sub.w lr, lr, #1 @@ -381,7 +379,7 @@ ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bpl .LBB2_10 -; CHECK-NEXT: .LBB2_6: @ %cond.load5 +; CHECK-NEXT: .LBB2_6: @ %cond.load6 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s20, [r0, #2] ; CHECK-NEXT: vmov r5, s24 @@ -411,7 +409,7 @@ ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_7 -; CHECK-NEXT: .LBB2_11: @ %cond.load8 +; CHECK-NEXT: .LBB2_11: @ %cond.load9 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s24, s20 ; CHECK-NEXT: vmov r4, s20 @@ -426,7 +424,7 @@ ; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bpl .LBB2_8 -; CHECK-NEXT: .LBB2_12: @ %cond.load11 +; CHECK-NEXT: .LBB2_12: @ %cond.load12 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s20, s24 ; CHECK-NEXT: vmov r4, s24 @@ -438,7 +436,7 @@ ; CHECK-NEXT: vmov.16 q5[2], r2 ; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: .LBB2_13: @ %else12 +; CHECK-NEXT: .LBB2_13: @ %else13 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q7 @@ -458,11 +456,11 @@ ; CHECK-NEXT: bfi r2, r4, #3, #1 ; CHECK-NEXT: lsls r4, r2, #31 ; CHECK-NEXT: bne .LBB2_17 -; CHECK-NEXT: @ %bb.14: @ %else16 +; CHECK-NEXT: @ %bb.14: @ %else17 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bpl .LBB2_18 -; CHECK-NEXT: .LBB2_15: @ %cond.load18 +; CHECK-NEXT: .LBB2_15: @ %cond.load19 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r1, #2] ; CHECK-NEXT: vmov r5, s28 @@ -481,7 +479,7 @@ ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi.w .LBB2_2 ; CHECK-NEXT: b .LBB2_20 -; CHECK-NEXT: .LBB2_17: @ %cond.load15 +; CHECK-NEXT: .LBB2_17: @ %cond.load16 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vldr.16 s28, [r1] ; CHECK-NEXT: lsls r4, r2, #30 @@ -490,7 +488,7 @@ ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_16 -; CHECK-NEXT: .LBB2_19: @ %cond.load21 +; CHECK-NEXT: .LBB2_19: @ %cond.load22 ; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 ; CHECK-NEXT: vmovx.f16 s28, s24 ; CHECK-NEXT: vmov r4, s24 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -4,9 +4,9 @@ define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq.w .LBB0_10 +; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB0_3 @@ -34,91 +34,77 @@ ; CHECK-NEXT: beq .LBB0_11 ; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: adds r5, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and lr, r3, #3 ; CHECK-NEXT: wls lr, lr, .LBB0_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: mvn r4, #3 -; CHECK-NEXT: add.w r7, r4, r12, lsl #2 -; CHECK-NEXT: adds r4, r0, r7 -; CHECK-NEXT: adds r6, r1, r7 -; CHECK-NEXT: add r7, r2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12, lsl #2 +; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB0_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6, #4] +; CHECK-NEXT: vldr s0, [r6] ; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vldr s2, [r5] +; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: add.w r12, r12, #1 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7, #4] +; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r8, r0, #8 -; CHECK-NEXT: sub.w r10, r1, #8 -; CHECK-NEXT: sub.w r5, r2, #8 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r3, r12 +; CHECK-NEXT: lsl.w r4, r12, #2 ; CHECK-NEXT: .LBB0_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: add.w r9, r2, r12 -; CHECK-NEXT: add.w r7, r8, r12 -; CHECK-NEXT: vldr s0, [r6, #4] -; CHECK-NEXT: add.w r3, r10, r12 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: add.w r11, r5, r12 -; CHECK-NEXT: add.w r8, r8, #16 -; CHECK-NEXT: add.w r10, r10, #16 -; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r7, r1, r4 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r5, r2, r4 ; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vldr s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs.w lr, lr, #4 -; CHECK-NEXT: vstr s0, [r9, #4] -; CHECK-NEXT: vldr s0, [r3, #12] -; CHECK-NEXT: vldr s2, [r7, #12] +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vmul.f32 s0, s2, s0 +; CHECK-NEXT: vstr s0, [r5] +; CHECK-NEXT: vldr s0, [r7, #4] +; CHECK-NEXT: vldr s2, [r6, #4] ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #12] -; CHECK-NEXT: vldr s0, [r3, #16] -; CHECK-NEXT: vldr s2, [r7, #16] +; CHECK-NEXT: vstr s0, [r5, #4] +; CHECK-NEXT: vldr s0, [r7, #8] +; CHECK-NEXT: vldr s2, [r6, #8] ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #16] -; CHECK-NEXT: vldr s0, [r6, #16] -; CHECK-NEXT: vldr s2, [r4, #16] +; CHECK-NEXT: vstr s0, [r5, #8] +; CHECK-NEXT: vldr s0, [r7, #12] +; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r9, #16] +; CHECK-NEXT: vstr s0, [r5, #12] ; CHECK-NEXT: bne .LBB0_9 ; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB0_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: sub.w r7, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #16 -; CHECK-NEXT: sub.w r5, r1, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r5, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r4, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r4], #16 ; CHECK-NEXT: vmul.f32 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r6, #16]! +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB0_12 ; CHECK-NEXT: @ %bb.13: @ %middle.block ; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: bne.w .LBB0_4 +; CHECK-NEXT: bne .LBB0_4 ; CHECK-NEXT: b .LBB0_10 entry: %cmp8 = icmp eq i32 %N, 0 @@ -237,9 +223,9 @@ define arm_aapcs_vfpcc void @float_float_add(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq.w .LBB1_10 +; CHECK-NEXT: beq .LBB1_10 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 @@ -267,91 +253,77 @@ ; CHECK-NEXT: beq .LBB1_11 ; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: adds r5, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and lr, r3, #3 ; CHECK-NEXT: wls lr, lr, .LBB1_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: mvn r4, #3 -; CHECK-NEXT: add.w r7, r4, r12, lsl #2 -; CHECK-NEXT: adds r4, r0, r7 -; CHECK-NEXT: adds r6, r1, r7 -; CHECK-NEXT: add r7, r2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12, lsl #2 +; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB1_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6, #4] +; CHECK-NEXT: vldr s0, [r6] ; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vldr s2, [r5] +; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: add.w r12, r12, #1 ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7, #4] +; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB1_6 ; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB1_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r8, r0, #8 -; CHECK-NEXT: sub.w r10, r1, #8 -; CHECK-NEXT: sub.w r5, r2, #8 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r3, r12 +; CHECK-NEXT: lsl.w r4, r12, #2 ; CHECK-NEXT: .LBB1_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: add.w r9, r2, r12 -; CHECK-NEXT: add.w r7, r8, r12 -; CHECK-NEXT: vldr s0, [r6, #4] -; CHECK-NEXT: add.w r3, r10, r12 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: add.w r11, r5, r12 -; CHECK-NEXT: add.w r8, r8, #16 -; CHECK-NEXT: add.w r10, r10, #16 -; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r7, r1, r4 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r5, r2, r4 ; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vldr s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs.w lr, lr, #4 -; CHECK-NEXT: vstr s0, [r9, #4] -; CHECK-NEXT: vldr s0, [r3, #12] -; CHECK-NEXT: vldr s2, [r7, #12] +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NEXT: vstr s0, [r5] +; CHECK-NEXT: vldr s0, [r7, #4] +; CHECK-NEXT: vldr s2, [r6, #4] ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #12] -; CHECK-NEXT: vldr s0, [r3, #16] -; CHECK-NEXT: vldr s2, [r7, #16] +; CHECK-NEXT: vstr s0, [r5, #4] +; CHECK-NEXT: vldr s0, [r7, #8] +; CHECK-NEXT: vldr s2, [r6, #8] ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #16] -; CHECK-NEXT: vldr s0, [r6, #16] -; CHECK-NEXT: vldr s2, [r4, #16] +; CHECK-NEXT: vstr s0, [r5, #8] +; CHECK-NEXT: vldr s0, [r7, #12] +; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r9, #16] +; CHECK-NEXT: vstr s0, [r5, #12] ; CHECK-NEXT: bne .LBB1_9 ; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB1_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: sub.w r7, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #16 -; CHECK-NEXT: sub.w r5, r1, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r5, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r4, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r4], #16 ; CHECK-NEXT: vadd.f32 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r6, #16]! +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB1_12 ; CHECK-NEXT: @ %bb.13: @ %middle.block ; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: bne.w .LBB1_4 +; CHECK-NEXT: bne .LBB1_4 ; CHECK-NEXT: b .LBB1_10 entry: %cmp8 = icmp eq i32 %N, 0 @@ -470,9 +442,9 @@ define arm_aapcs_vfpcc void @float_float_sub(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_sub: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq.w .LBB2_10 +; CHECK-NEXT: beq .LBB2_10 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB2_3 @@ -500,91 +472,77 @@ ; CHECK-NEXT: beq .LBB2_11 ; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22 ; CHECK-NEXT: mvn.w r7, r12 -; CHECK-NEXT: adds r5, r7, r3 +; CHECK-NEXT: adds r4, r7, r3 ; CHECK-NEXT: and lr, r3, #3 ; CHECK-NEXT: wls lr, lr, .LBB2_7 ; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader -; CHECK-NEXT: mvn r4, #3 -; CHECK-NEXT: add.w r7, r4, r12, lsl #2 -; CHECK-NEXT: adds r4, r0, r7 -; CHECK-NEXT: adds r6, r1, r7 -; CHECK-NEXT: add r7, r2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12, lsl #2 +; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB2_6: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr s0, [r6, #4] +; CHECK-NEXT: vldr s0, [r6] ; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vldr s2, [r5] +; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: add.w r12, r12, #1 ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r7, #4] +; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB2_6 ; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: blo .LBB2_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r8, r0, #8 -; CHECK-NEXT: sub.w r10, r1, #8 -; CHECK-NEXT: sub.w r5, r2, #8 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: lsl.w r12, r12, #2 +; CHECK-NEXT: sub.w r3, r3, r12 +; CHECK-NEXT: lsl.w r4, r12, #2 ; CHECK-NEXT: .LBB2_9: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: add.w r9, r2, r12 -; CHECK-NEXT: add.w r7, r8, r12 -; CHECK-NEXT: vldr s0, [r6, #4] -; CHECK-NEXT: add.w r3, r10, r12 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: add.w r11, r5, r12 -; CHECK-NEXT: add.w r8, r8, #16 -; CHECK-NEXT: add.w r10, r10, #16 -; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r7, r1, r4 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r5, r2, r4 ; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldr s0, [r7] ; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vldr s2, [r6] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs.w lr, lr, #4 -; CHECK-NEXT: vstr s0, [r9, #4] -; CHECK-NEXT: vldr s0, [r3, #12] -; CHECK-NEXT: vldr s2, [r7, #12] +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vsub.f32 s0, s2, s0 +; CHECK-NEXT: vstr s0, [r5] +; CHECK-NEXT: vldr s0, [r7, #4] +; CHECK-NEXT: vldr s2, [r6, #4] ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #12] -; CHECK-NEXT: vldr s0, [r3, #16] -; CHECK-NEXT: vldr s2, [r7, #16] +; CHECK-NEXT: vstr s0, [r5, #4] +; CHECK-NEXT: vldr s0, [r7, #8] +; CHECK-NEXT: vldr s2, [r6, #8] ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r11, #16] -; CHECK-NEXT: vldr s0, [r6, #16] -; CHECK-NEXT: vldr s2, [r4, #16] +; CHECK-NEXT: vstr s0, [r5, #8] +; CHECK-NEXT: vldr s0, [r7, #12] +; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vsub.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r9, #16] +; CHECK-NEXT: vstr s0, [r5, #12] ; CHECK-NEXT: bne .LBB2_9 ; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .LBB2_11: @ %vector.ph ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: sub.w r7, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #16 -; CHECK-NEXT: sub.w r5, r1, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_12: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r5, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r4, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r4], #16 ; CHECK-NEXT: vsub.f32 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r6, #16]! +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB2_12 ; CHECK-NEXT: @ %bb.13: @ %middle.block ; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: bne.w .LBB2_4 +; CHECK-NEXT: bne .LBB2_4 ; CHECK-NEXT: b .LBB2_10 entry: %cmp8 = icmp eq i32 %N, 0 @@ -703,7 +661,7 @@ define arm_aapcs_vfpcc void @float_int_mul(float* nocapture readonly %a, i32* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_int_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB3_13 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -720,18 +678,18 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: sub.w r7, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #16 -; CHECK-NEXT: sub.w r5, r1, #16 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r5, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r4, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r4], #16 ; CHECK-NEXT: vcvt.f32.s32 q0, q0 ; CHECK-NEXT: vmul.f32 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r6, #16]! +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB3_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -740,77 +698,65 @@ ; CHECK-NEXT: .LBB3_6: ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16 -; CHECK-NEXT: mvn.w r6, r12 -; CHECK-NEXT: adds r5, r6, r3 +; CHECK-NEXT: mvn.w r7, r12 +; CHECK-NEXT: add.w r8, r7, r3 ; CHECK-NEXT: and lr, r3, #3 ; CHECK-NEXT: wls lr, lr, .LBB3_10 ; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader -; CHECK-NEXT: mvn r4, #3 -; CHECK-NEXT: add.w r6, r4, r12, lsl #2 -; CHECK-NEXT: adds r4, r0, r6 -; CHECK-NEXT: add r6, r2 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12, lsl #2 +; CHECK-NEXT: add.w r7, r2, r12, lsl #2 ; CHECK-NEXT: .LBB3_9: @ %for.body.prol ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r7, r1, r12, lsl #2 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: add.w r12, r12, #1 -; CHECK-NEXT: vldr s0, [r7] +; CHECK-NEXT: vmov s0, r4 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 -; CHECK-NEXT: vldr s2, [r4, #4] -; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: vldr s2, [r5] +; CHECK-NEXT: adds r5, #4 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r6, #4] -; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: vstr s0, [r7] +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: le lr, .LBB3_9 ; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit -; CHECK-NEXT: cmp r5, #3 +; CHECK-NEXT: cmp.w r8, #3 ; CHECK-NEXT: blo .LBB3_13 ; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1 -; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r10, r0, #8 -; CHECK-NEXT: sub.w r4, r1, #8 -; CHECK-NEXT: sub.w r5, r2, #8 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: sub.w r3, r3, r12 +; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: lsl.w r12, r12, #2 ; CHECK-NEXT: .LBB3_12: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r3, r1, r12 -; CHECK-NEXT: add.w r11, r0, r12 -; CHECK-NEXT: add.w r9, r2, r12 -; CHECK-NEXT: add.w r6, r4, r12 -; CHECK-NEXT: vldr s0, [r3, #4] -; CHECK-NEXT: add.w r7, r10, r12 -; CHECK-NEXT: add.w r8, r5, r12 -; CHECK-NEXT: add.w r10, r10, #16 -; CHECK-NEXT: vcvt.f32.s32 s0, s0 -; CHECK-NEXT: vldr s2, [r11, #4] -; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: vldr s0, [r1, #-8] +; CHECK-NEXT: add.w r7, r0, r12 +; CHECK-NEXT: add.w r6, r2, r12 ; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vcvt.f32.s32 s0, s0 +; CHECK-NEXT: vldr s2, [r7] ; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: subs.w lr, lr, #4 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r9, #4] -; CHECK-NEXT: vldr s0, [r6, #12] +; CHECK-NEXT: vstr s0, [r6] +; CHECK-NEXT: vldr s0, [r1, #-4] ; CHECK-NEXT: vcvt.f32.s32 s0, s0 -; CHECK-NEXT: vldr s2, [r7, #12] +; CHECK-NEXT: vldr s2, [r7, #4] ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r8, #12] -; CHECK-NEXT: vldr s2, [r6, #16] -; CHECK-NEXT: vldr s0, [r7, #16] -; CHECK-NEXT: vcvt.f32.s32 s2, s2 -; CHECK-NEXT: vmul.f32 s0, s0, s2 -; CHECK-NEXT: vstr s0, [r8, #16] -; CHECK-NEXT: vldr s0, [r3, #16] +; CHECK-NEXT: vstr s0, [r6, #4] +; CHECK-NEXT: vldr s0, [r1] ; CHECK-NEXT: vcvt.f32.s32 s0, s0 -; CHECK-NEXT: vldr s2, [r11, #16] +; CHECK-NEXT: vldr s2, [r7, #8] ; CHECK-NEXT: vmul.f32 s0, s2, s0 -; CHECK-NEXT: vstr s0, [r9, #16] +; CHECK-NEXT: vstr s0, [r6, #8] +; CHECK-NEXT: vldr s0, [r1, #4] +; CHECK-NEXT: add.w r1, r1, #16 +; CHECK-NEXT: vcvt.f32.s32 s0, s0 +; CHECK-NEXT: vldr s2, [r7, #12] +; CHECK-NEXT: vmul.f32 s0, s2, s0 +; CHECK-NEXT: vstr s0, [r6, #12] ; CHECK-NEXT: bne .LBB3_12 ; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -942,18 +888,18 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #16 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: sub.w r5, r1, #16 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r4, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r5, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r4], #16 +; CHECK-NEXT: vldrw.u32 q1, [r5], #16 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vcvt.f32.s32 q0, q0 -; CHECK-NEXT: vstrb.8 q0, [r6, #16]! +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -961,21 +907,18 @@ ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11 ; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: mvn r3, #3 -; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: add r1, r3 -; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r3, [r0, #4]! -; CHECK-NEXT: ldr r6, [r1, #4]! +; CHECK-NEXT: ldr r3, [r0], #4 +; CHECK-NEXT: ldr r6, [r1], #4 ; CHECK-NEXT: muls r3, r6, r3 ; CHECK-NEXT: vmov s0, r3 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 -; CHECK-NEXT: vstr s0, [r2, #4] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -1037,7 +980,7 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: half_half_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -1050,21 +993,23 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #8 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: sub.w r5, r1, #8 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r8, [r5, #8]! -; CHECK-NEXT: ldr r7, [r4, #8]! -; CHECK-NEXT: vmov.32 q1[0], r8 -; CHECK-NEXT: vmov.32 q0[0], r7 -; CHECK-NEXT: ldr r7, [r5, #4] +; CHECK-NEXT: ldr.w r9, [r4] +; CHECK-NEXT: ldr r7, [r5] ; CHECK-NEXT: ldr.w r8, [r4, #4] -; CHECK-NEXT: vmov.32 q1[1], r7 +; CHECK-NEXT: vmov.32 q0[0], r9 +; CHECK-NEXT: ldr.w r10, [r5, #4] +; CHECK-NEXT: vmov.32 q1[0], r7 ; CHECK-NEXT: vmov.32 q0[1], r8 +; CHECK-NEXT: adds r4, #8 +; CHECK-NEXT: vmov.32 q1[1], r10 +; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vmul.f16 q0, q0, q1 ; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 @@ -1072,31 +1017,30 @@ ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6, #16]! +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11 ; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r3, r12, #1 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r0, r0, r3, lsl #1 -; CHECK-NEXT: add.w r1, r1, r3, lsl #1 -; CHECK-NEXT: add.w r2, r2, r3, lsl #2 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s0, [r1, #2] -; CHECK-NEXT: vldr.16 s2, [r0, #2] +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vmul.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2, #4] +; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -1155,7 +1099,7 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: half_half_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -1168,21 +1112,23 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #8 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: sub.w r5, r1, #8 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r8, [r5, #8]! -; CHECK-NEXT: ldr r7, [r4, #8]! -; CHECK-NEXT: vmov.32 q1[0], r8 -; CHECK-NEXT: vmov.32 q0[0], r7 -; CHECK-NEXT: ldr r7, [r5, #4] +; CHECK-NEXT: ldr.w r9, [r4] +; CHECK-NEXT: ldr r7, [r5] ; CHECK-NEXT: ldr.w r8, [r4, #4] -; CHECK-NEXT: vmov.32 q1[1], r7 +; CHECK-NEXT: vmov.32 q0[0], r9 +; CHECK-NEXT: ldr.w r10, [r5, #4] +; CHECK-NEXT: vmov.32 q1[0], r7 ; CHECK-NEXT: vmov.32 q0[1], r8 +; CHECK-NEXT: adds r4, #8 +; CHECK-NEXT: vmov.32 q1[1], r10 +; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 @@ -1190,31 +1136,30 @@ ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6, #16]! +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11 ; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r3, r12, #1 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r0, r0, r3, lsl #1 -; CHECK-NEXT: add.w r1, r1, r3, lsl #1 -; CHECK-NEXT: add.w r2, r2, r3, lsl #2 ; CHECK-NEXT: .LBB6_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s0, [r1, #2] -; CHECK-NEXT: vldr.16 s2, [r0, #2] +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vadd.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2, #4] +; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: le lr, .LBB6_7 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -1273,7 +1218,7 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: half_half_sub: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB7_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -1286,21 +1231,23 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #8 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: sub.w r5, r1, #8 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r8, [r5, #8]! -; CHECK-NEXT: ldr r7, [r4, #8]! -; CHECK-NEXT: vmov.32 q1[0], r8 -; CHECK-NEXT: vmov.32 q0[0], r7 -; CHECK-NEXT: ldr r7, [r5, #4] +; CHECK-NEXT: ldr.w r9, [r4] +; CHECK-NEXT: ldr r7, [r5] ; CHECK-NEXT: ldr.w r8, [r4, #4] -; CHECK-NEXT: vmov.32 q1[1], r7 +; CHECK-NEXT: vmov.32 q0[0], r9 +; CHECK-NEXT: ldr.w r10, [r5, #4] +; CHECK-NEXT: vmov.32 q1[0], r7 ; CHECK-NEXT: vmov.32 q0[1], r8 +; CHECK-NEXT: adds r4, #8 +; CHECK-NEXT: vmov.32 q1[1], r10 +; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vsub.f16 q0, q0, q1 ; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 @@ -1308,31 +1255,30 @@ ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6, #16]! +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB7_8 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11 ; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r3, r12, #1 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r0, r0, r3, lsl #1 -; CHECK-NEXT: add.w r1, r1, r3, lsl #1 -; CHECK-NEXT: add.w r2, r2, r3, lsl #2 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s0, [r1, #2] -; CHECK-NEXT: vldr.16 s2, [r0, #2] +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vsub.f16 s0, s2, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2, #4] +; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -1391,7 +1337,7 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: half_short_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -1404,15 +1350,17 @@ ; CHECK-NEXT: bic r12, r3, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: sub.w r6, r12, #4 -; CHECK-NEXT: sub.w r4, r0, #8 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: sub.w r5, r1, #8 -; CHECK-NEXT: sub.w r6, r2, #16 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r8, [r4, #8]! -; CHECK-NEXT: vldrh.u32 q0, [r5, #8]! +; CHECK-NEXT: ldr.w r9, [r4] +; CHECK-NEXT: ldr.w r8, [r4, #4] +; CHECK-NEXT: vldrh.u32 q0, [r5], #8 +; CHECK-NEXT: adds r4, #8 ; CHECK-NEXT: vmov r7, s0 ; CHECK-NEXT: vmov.16 q1[0], r7 ; CHECK-NEXT: vmov r7, s1 @@ -1421,10 +1369,9 @@ ; CHECK-NEXT: vmov.16 q1[2], r7 ; CHECK-NEXT: vmov r7, s3 ; CHECK-NEXT: vmov.16 q1[3], r7 -; CHECK-NEXT: ldr r7, [r4, #4] ; CHECK-NEXT: vcvt.f16.s16 q0, q1 -; CHECK-NEXT: vmov.32 q1[0], r8 -; CHECK-NEXT: vmov.32 q1[1], r7 +; CHECK-NEXT: vmov.32 q1[0], r9 +; CHECK-NEXT: vmov.32 q1[1], r8 ; CHECK-NEXT: vmul.f16 q0, q1, q0 ; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 @@ -1432,32 +1379,31 @@ ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtb.f32.f16 s9, s4 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 -; CHECK-NEXT: vstrb.8 q2, [r6, #16]! +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13 ; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: sub.w r3, r12, #1 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: add.w r2, r2, r12, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r0, r0, r3, lsl #1 -; CHECK-NEXT: add.w r1, r1, r3, lsl #1 -; CHECK-NEXT: add.w r2, r2, r3, lsl #2 ; CHECK-NEXT: .LBB8_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrsh r3, [r1, #2]! -; CHECK-NEXT: vldr.16 s0, [r0, #2] +; CHECK-NEXT: ldrsh r3, [r1], #2 +; CHECK-NEXT: vldr.16 s0, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: vmov s2, r3 ; CHECK-NEXT: vcvt.f16.s32 s2, s2 ; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-NEXT: vstr s0, [r2, #4] +; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: le lr, .LBB8_7 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader @@ -1518,11 +1464,11 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_half_mac: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: cbz r2, .LBB9_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 -; CHECK-NEXT: and r4, r2, #3 +; CHECK-NEXT: and r5, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB9_4 ; CHECK-NEXT: @ %bb.2: @@ -1539,25 +1485,25 @@ ; CHECK-NEXT: vldr s0, .LCPI9_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: sub.w r3, r0, #8 -; CHECK-NEXT: sub.w r2, r1, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s2, [r2, #14] -; CHECK-NEXT: vldr.16 s4, [r3, #14] -; CHECK-NEXT: vldr.16 s6, [r3, #12] -; CHECK-NEXT: vldr.16 s8, [r3, #10] +; CHECK-NEXT: adds r4, r0, r3 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: vldr.16 s2, [r2, #6] +; CHECK-NEXT: vldr.16 s4, [r4, #6] +; CHECK-NEXT: vldr.16 s6, [r4, #4] +; CHECK-NEXT: vldr.16 s8, [r4, #2] ; CHECK-NEXT: vmul.f16 s2, s4, s2 -; CHECK-NEXT: vldr.16 s4, [r2, #12] -; CHECK-NEXT: vldr.16 s10, [r3, #8] +; CHECK-NEXT: vldr.16 s4, [r2, #4] +; CHECK-NEXT: vldr.16 s10, [r4] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmul.f16 s4, s6, s4 -; CHECK-NEXT: vldr.16 s6, [r2, #10] +; CHECK-NEXT: vldr.16 s6, [r2, #2] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vmul.f16 s6, s8, s6 -; CHECK-NEXT: vldr.16 s8, [r2, #8] -; CHECK-NEXT: adds r2, #8 +; CHECK-NEXT: vldr.16 s8, [r2] ; CHECK-NEXT: vmul.f16 s8, s10, s8 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 @@ -1569,17 +1515,15 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB9_5 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r4, .LBB9_9 +; CHECK-NEXT: wls lr, r5, .LBB9_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader -; CHECK-NEXT: mvn r2, #1 -; CHECK-NEXT: mov lr, r4 -; CHECK-NEXT: add.w r2, r2, r12, lsl #1 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: mov lr, r5 ; CHECK-NEXT: .LBB9_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s2, [r1, #2] -; CHECK-NEXT: vldr.16 s4, [r0, #2] +; CHECK-NEXT: vldr.16 s2, [r1] +; CHECK-NEXT: vldr.16 s4, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vmul.f16 s2, s4, s2 @@ -1587,7 +1531,7 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB9_8 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI9_0: @@ -1677,11 +1621,11 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) { ; CHECK-LABEL: half_half_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: cbz r2, .LBB10_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 -; CHECK-NEXT: and r4, r2, #3 +; CHECK-NEXT: and r5, r2, #3 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB10_4 ; CHECK-NEXT: @ %bb.2: @@ -1698,25 +1642,25 @@ ; CHECK-NEXT: vldr s0, .LCPI10_0 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 -; CHECK-NEXT: sub.w r3, r0, #8 -; CHECK-NEXT: sub.w r2, r1, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s2, [r2, #14] -; CHECK-NEXT: vldr.16 s4, [r3, #14] -; CHECK-NEXT: vldr.16 s6, [r3, #12] -; CHECK-NEXT: vldr.16 s8, [r3, #10] +; CHECK-NEXT: adds r4, r0, r3 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: vldr.16 s2, [r2, #6] +; CHECK-NEXT: vldr.16 s4, [r4, #6] +; CHECK-NEXT: vldr.16 s6, [r4, #4] +; CHECK-NEXT: vldr.16 s8, [r4, #2] ; CHECK-NEXT: vadd.f16 s2, s4, s2 -; CHECK-NEXT: vldr.16 s4, [r2, #12] -; CHECK-NEXT: vldr.16 s10, [r3, #8] +; CHECK-NEXT: vldr.16 s4, [r2, #4] +; CHECK-NEXT: vldr.16 s10, [r4] ; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vadd.f16 s4, s6, s4 -; CHECK-NEXT: vldr.16 s6, [r2, #10] +; CHECK-NEXT: vldr.16 s6, [r2, #2] ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vadd.f16 s6, s8, s6 -; CHECK-NEXT: vldr.16 s8, [r2, #8] -; CHECK-NEXT: adds r2, #8 +; CHECK-NEXT: vldr.16 s8, [r2] ; CHECK-NEXT: vadd.f16 s8, s10, s8 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 @@ -1728,17 +1672,15 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB10_5 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r4, .LBB10_9 +; CHECK-NEXT: wls lr, r5, .LBB10_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader -; CHECK-NEXT: mvn r2, #1 -; CHECK-NEXT: mov lr, r4 -; CHECK-NEXT: add.w r2, r2, r12, lsl #1 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 +; CHECK-NEXT: mov lr, r5 ; CHECK-NEXT: .LBB10_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldr.16 s2, [r1, #2] -; CHECK-NEXT: vldr.16 s4, [r0, #2] +; CHECK-NEXT: vldr.16 s2, [r1] +; CHECK-NEXT: vldr.16 s4, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: vadd.f16 s2, s4, s2 @@ -1746,7 +1688,7 @@ ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: le lr, .LBB10_8 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI10_0: @@ -1897,15 +1839,13 @@ ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r7, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader -; CHECK-NEXT: mvn r3, #1 -; CHECK-NEXT: add.w r2, r3, r12, lsl #1 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: add.w r1, r1, r12, lsl #1 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: .LBB11_8: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrsh r2, [r1, #2]! -; CHECK-NEXT: vldr.16 s2, [r0, #2] +; CHECK-NEXT: ldrsh r2, [r1], #2 +; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: adds r0, #2 ; CHECK-NEXT: vmov s4, r2 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -9,28 +9,18 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q2, [r2] -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrb.u32 q2, [r1], #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -149,28 +139,18 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q2, [r2] -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrb.u32 q2, [r1], #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} @@ -350,93 +330,91 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB5_12 +; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r4, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r1, r12 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r4, r5, r7 +; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 +; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r5, r0, r4 -; CHECK-NEXT: vldrb.u32 q0, [r5] -; CHECK-NEXT: adds r5, r1, r4 -; CHECK-NEXT: vldrb.u32 q1, [r5] +; CHECK-NEXT: vldrb.u32 q0, [r0], #4 +; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 -; CHECK-NEXT: b .LBB5_12 +; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 -; CHECK-NEXT: subs r4, r1, #3 -; CHECK-NEXT: subs r5, r0, #3 -; CHECK-NEXT: sub.w r7, r3, #16 -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: add.w r4, r3, #8 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r8, [r5, #3] -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrb r6, [r4, #3] -; CHECK-NEXT: cmp r12, r9 -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #16]! -; CHECK-NEXT: ldrb r8, [r5, #4]! -; CHECK-NEXT: ldrb r6, [r4, #4]! -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #4] +; CHECK-NEXT: ldrb.w r5, [r0, r12] +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: ldrb.w r6, [r1, r12] +; CHECK-NEXT: smlabb r5, r6, r5, r2 +; CHECK-NEXT: str r5, [r4, #-8] +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: ldrb r6, [r7, #1] +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: ldrb r6, [r4, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #8] +; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r4, #2] +; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #12] -; CHECK-NEXT: bne .LBB5_7 -; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit -; CHECK-NEXT: rsb.w r7, r9, #0 -; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, lr, .LBB5_12 -; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: add r0, r7 -; CHECK-NEXT: add r1, r7 -; CHECK-NEXT: add.w r3, r3, r7, lsl #2 -; CHECK-NEXT: .LBB5_11: @ %for.body.epil +; CHECK-NEXT: str r6, [r4] +; CHECK-NEXT: ldrb r5, [r5, #3] +; CHECK-NEXT: ldrb r6, [r7, #3] +; CHECK-NEXT: smlabb r5, r6, r5, r2 +; CHECK-NEXT: str r5, [r4, #4] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: le lr, .LBB5_7 +; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa +; CHECK-NEXT: wls lr, r9, .LBB5_11 +; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: mov lr, r9 +; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0, #1]! -; CHECK-NEXT: ldrb r6, [r1, #1]! +; CHECK-NEXT: ldrb r7, [r0], #1 +; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3, #4]! -; CHECK-NEXT: le lr, .LBB5_11 -; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup +; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: le lr, .LBB5_10 +; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %res12 = bitcast i32* %res to i8* @@ -647,93 +625,91 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: beq.w .LBB7_12 +; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 -; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r4, r3, r12, lsl #2 +; CHECK-NEXT: add.w r5, r1, r12 +; CHECK-NEXT: cmp r4, r1 +; CHECK-NEXT: add.w r6, r0, r12 ; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r6, r3 -; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: ands r5, r4 -; CHECK-NEXT: lsls r5, r5, #31 +; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cset r6, hi +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: ands r6, r4 +; CHECK-NEXT: lsls r6, r6, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq r7, r6 -; CHECK-NEXT: lslseq.w r7, r7, #31 +; CHECK-NEXT: andeq.w r4, r5, r7 +; CHECK-NEXT: lslseq.w r4, r4, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader ; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 +; CHECK-NEXT: and r9, r12, #3 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: b .LBB7_9 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r5, r0, r4 -; CHECK-NEXT: vldrb.u32 q0, [r5] -; CHECK-NEXT: adds r5, r1, r4 -; CHECK-NEXT: vldrb.u32 q1, [r5] +; CHECK-NEXT: vldrb.u32 q0, [r0], #4 +; CHECK-NEXT: vldrb.u32 q1, [r1], #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vadd.i32 q0, q0, r2 ; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 -; CHECK-NEXT: b .LBB7_12 +; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r12 -; CHECK-NEXT: subs r4, r1, #3 -; CHECK-NEXT: subs r5, r0, #3 -; CHECK-NEXT: sub.w r7, r3, #16 -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: bic r7, r12, #3 +; CHECK-NEXT: add.w r4, r3, #8 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb.w r8, [r5, #3] -; CHECK-NEXT: sub.w r9, r9, #4 -; CHECK-NEXT: ldrb r6, [r4, #3] -; CHECK-NEXT: cmp r12, r9 -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #16]! -; CHECK-NEXT: ldrb r8, [r5, #4]! -; CHECK-NEXT: ldrb r6, [r4, #4]! -; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #4] +; CHECK-NEXT: ldrb.w r5, [r0, r12] +; CHECK-NEXT: add.w r7, r1, r12 +; CHECK-NEXT: ldrb.w r6, [r1, r12] +; CHECK-NEXT: smlabb r5, r6, r5, r2 +; CHECK-NEXT: str r5, [r4, #-8] +; CHECK-NEXT: add.w r5, r0, r12 +; CHECK-NEXT: ldrb r6, [r7, #1] +; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: ldrb.w r8, [r5, #1] -; CHECK-NEXT: ldrb r6, [r4, #1] ; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #8] +; CHECK-NEXT: str r6, [r4, #-4] ; CHECK-NEXT: ldrb.w r8, [r5, #2] -; CHECK-NEXT: ldrb r6, [r4, #2] +; CHECK-NEXT: ldrb r6, [r7, #2] ; CHECK-NEXT: smlabb r6, r6, r8, r2 -; CHECK-NEXT: str r6, [r7, #12] -; CHECK-NEXT: bne .LBB7_7 -; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit -; CHECK-NEXT: rsb.w r7, r9, #0 -; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, lr, .LBB7_12 -; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: add r0, r7 -; CHECK-NEXT: add r1, r7 -; CHECK-NEXT: add.w r3, r3, r7, lsl #2 -; CHECK-NEXT: .LBB7_11: @ %for.body.epil +; CHECK-NEXT: str r6, [r4] +; CHECK-NEXT: ldrb r5, [r5, #3] +; CHECK-NEXT: ldrb r6, [r7, #3] +; CHECK-NEXT: smlabb r5, r6, r5, r2 +; CHECK-NEXT: str r5, [r4, #4] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: le lr, .LBB7_7 +; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa +; CHECK-NEXT: wls lr, r9, .LBB7_11 +; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader +; CHECK-NEXT: add r0, r12 +; CHECK-NEXT: add r1, r12 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: mov lr, r9 +; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrb r7, [r0, #1]! -; CHECK-NEXT: ldrb r6, [r1, #1]! +; CHECK-NEXT: ldrb r7, [r0], #1 +; CHECK-NEXT: ldrb r6, [r1], #1 ; CHECK-NEXT: smlabb r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3, #4]! -; CHECK-NEXT: le lr, .LBB7_11 -; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup +; CHECK-NEXT: str r7, [r3], #4 +; CHECK-NEXT: le lr, .LBB7_10 +; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %res12 = bitcast i32* %res to i8* @@ -941,33 +917,33 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) { ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: ldr.w r12, [sp, #32] +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: ldr.w r12, [sp, #28] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r4, r3, r12, lsl #2 -; CHECK-NEXT: add.w r5, r1, r12, lsl #2 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r12, lsl #2 -; CHECK-NEXT: cset r7, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: add.w r6, r3, r12, lsl #2 +; CHECK-NEXT: add.w r4, r1, r12, lsl #2 +; CHECK-NEXT: cmp r6, r1 +; CHECK-NEXT: add.w r5, r0, r12, lsl #2 +; CHECK-NEXT: cset lr, hi +; CHECK-NEXT: cmp r4, r3 ; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: cmp r6, r3 +; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: ands r5, r6 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r4, r5, r7 -; CHECK-NEXT: lslseq.w r4, r4, #31 +; CHECK-NEXT: andeq.w r5, r4, lr +; CHECK-NEXT: lslseq.w r5, r5, #31 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and r10, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: sub.w r5, r12, #1 +; CHECK-NEXT: and r9, r12, #3 +; CHECK-NEXT: cmp r5, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: mov.w r12, #0 @@ -985,54 +961,53 @@ ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new -; CHECK-NEXT: bic r7, r12, #3 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: bic r5, r12, #3 +; CHECK-NEXT: add.w r4, r3, #8 +; CHECK-NEXT: subs r5, #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: add.w lr, lr, r7, lsr #2 +; CHECK-NEXT: add.w lr, r6, r5, lsr #2 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: add.w r6, r1, #8 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r5, [r0, r4] -; CHECK-NEXT: add.w r9, r0, r4 -; CHECK-NEXT: ldr r6, [r1, r4] -; CHECK-NEXT: adds r7, r1, r4 +; CHECK-NEXT: ldr r8, [r5, #-8] ; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: mla r5, r6, r5, r2 -; CHECK-NEXT: str r5, [r3, r4] -; CHECK-NEXT: ldr.w r8, [r9, #4] -; CHECK-NEXT: ldr r6, [r7, #4] -; CHECK-NEXT: mla r8, r6, r8, r2 -; CHECK-NEXT: adds r6, r3, r4 +; CHECK-NEXT: ldr r7, [r6, #-8] +; CHECK-NEXT: mla r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-8] +; CHECK-NEXT: ldr r8, [r5, #-4] +; CHECK-NEXT: ldr r7, [r6, #-4] +; CHECK-NEXT: mla r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #-4] +; CHECK-NEXT: ldr.w r8, [r5] +; CHECK-NEXT: ldr r7, [r6] +; CHECK-NEXT: mla r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4] +; CHECK-NEXT: ldr.w r8, [r5, #4] +; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: ldr r7, [r6, #4] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: mla r7, r7, r8, r2 +; CHECK-NEXT: str r7, [r4, #4] ; CHECK-NEXT: adds r4, #16 -; CHECK-NEXT: str.w r8, [r6, #4] -; CHECK-NEXT: ldr.w r8, [r9, #8] -; CHECK-NEXT: ldr r5, [r7, #8] -; CHECK-NEXT: mla r5, r5, r8, r2 -; CHECK-NEXT: str r5, [r6, #8] -; CHECK-NEXT: ldr.w r5, [r9, #12] -; CHECK-NEXT: ldr r7, [r7, #12] -; CHECK-NEXT: mla r5, r7, r5, r2 -; CHECK-NEXT: str r5, [r6, #12] ; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, r10, .LBB9_11 +; CHECK-NEXT: wls lr, r9, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: mvn r7, #3 -; CHECK-NEXT: mov lr, r10 -; CHECK-NEXT: add.w r7, r7, r12, lsl #2 -; CHECK-NEXT: add r0, r7 -; CHECK-NEXT: add r1, r7 -; CHECK-NEXT: add r3, r7 +; CHECK-NEXT: add.w r0, r0, r12, lsl #2 +; CHECK-NEXT: add.w r1, r1, r12, lsl #2 +; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: mov lr, r9 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r7, [r0, #4]! -; CHECK-NEXT: ldr r6, [r1, #4]! -; CHECK-NEXT: mla r7, r6, r7, r2 -; CHECK-NEXT: str r7, [r3, #4]! +; CHECK-NEXT: ldr r6, [r0], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: mla r6, r5, r6, r2 +; CHECK-NEXT: str r6, [r3], #4 ; CHECK-NEXT: le lr, .LBB9_10 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck @@ -1162,24 +1137,20 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture %a, i8* nocapture readonly %b, i8* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: test_v8i8_to_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r4, r1, r12 -; CHECK-NEXT: vldrb.u16 q0, [r4] -; CHECK-NEXT: add.w r4, r2, r12 -; CHECK-NEXT: add.w r12, r12, #8 -; CHECK-NEXT: vldrb.u16 q1, [r4] +; CHECK-NEXT: vldrb.u16 q0, [r1], #8 +; CHECK-NEXT: vldrb.u16 q1, [r2], #8 ; CHECK-NEXT: vmul.i16 q0, q1, q0 ; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -292,25 +292,20 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) { ; CHECK-LABEL: vector_mul_vector_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: popeq {r7, pc} ; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add.w r4, r1, r12 -; CHECK-NEXT: vldrb.u8 q0, [r4] -; CHECK-NEXT: add.w r4, r2, r12 -; CHECK-NEXT: vldrb.u8 q1, [r4] -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: add.w r12, r12, #16 +; CHECK-NEXT: vldrb.u8 q0, [r1], #16 +; CHECK-NEXT: vldrb.u8 q1, [r2], #16 ; CHECK-NEXT: vmul.i8 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r4] +; CHECK-NEXT: vstrb.8 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -747,12 +747,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB22_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 @@ -793,12 +792,11 @@ ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB23_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 Index: llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll +++ llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll @@ -7,17 +7,15 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vshl.u32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -53,17 +51,15 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 ; CHECK-NEXT: vshl.u16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! +; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -99,17 +95,15 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #4 ; CHECK-NEXT: vshl.u8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! +; CHECK-NEXT: vstrb.8 q0, [r1], #4 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -145,18 +139,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vshl.u32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -192,18 +184,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 ; CHECK-NEXT: vshl.u16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! +; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -239,18 +229,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #4 ; CHECK-NEXT: vshl.u8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! +; CHECK-NEXT: vstrb.8 q0, [r1], #4 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -286,18 +274,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vshl.s32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -333,18 +319,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 ; CHECK-NEXT: vshl.s16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! +; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} @@ -380,18 +364,16 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #4 ; CHECK-NEXT: vshl.s8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! +; CHECK-NEXT: vstrb.8 q0, [r1], #4 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %exit ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/mve-vldst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -20,24 +20,22 @@ ; CHECK-NEXT: vldr.16 s0, [sp, #176] ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: sub.w r12, r0, #64 ; CHECK-NEXT: add.w lr, r3, r2, lsr #3 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.16 q0, r2 -; CHECK-NEXT: subs r1, #64 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r12, #64]! -; CHECK-NEXT: vldrh.u16 q6, [r12, #32] -; CHECK-NEXT: vldrh.u16 q4, [r12, #48] -; CHECK-NEXT: vldrh.u16 q7, [r12, #16] -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vldrh.u16 q6, [r0, #32] +; CHECK-NEXT: vldrh.u16 q4, [r0, #48] +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vldrh.u16 q7, [r0, #16] +; CHECK-NEXT: vmov r3, s24 ; CHECK-NEXT: vmovx.f16 s12, s16 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov.16 q1[4], r3 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s0 @@ -50,6 +48,7 @@ ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s30 ; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: adds r0, #64 ; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload @@ -57,179 +56,180 @@ ; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmovx.f16 s8, s28 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q1[1], r3 ; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s8, s24 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r12, s23 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vmov.16 q2[4], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmovx.f16 s12, s30 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vstrw.32 q5, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov.16 q2[0], r12 ; CHECK-NEXT: vmul.f16 q1, q1, q3 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s4, s23 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s29 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s31 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmovx.f16 s4, s29 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmovx.f16 s4, s27 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmovx.f16 s4, s25 ; CHECK-NEXT: vmul.f16 q5, q2, q3 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmovx.f16 s8, s17 ; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r12, s20 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s8, s31 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmul.f16 q6, q0, q3 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s27 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.16 q1[2], r12 +; CHECK-NEXT: vmul.f16 q4, q0, q3 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovx.f16 s20, s23 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s16, s19 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q6[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov.16 q3[2], r2 ; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q7[0], r0 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov.16 q7[0], r2 ; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q7[5], r2 +; CHECK-NEXT: vmov r2, s22 ; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s20, s22 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmovx.f16 s20, s26 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q1[3], r3 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r2 ; CHECK-NEXT: vmov.f32 s1, s13 ; CHECK-NEXT: vmov.f32 s29, s9 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov.f32 s31, s11 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov.f32 s25, s21 +; CHECK-NEXT: vstrh.16 q7, [r1, #16] ; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s27, s23 ; CHECK-NEXT: vmov.f32 s3, s15 -; CHECK-NEXT: vstrb.8 q0, [r1, #64]! -; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vstrh.16 q6, [r1] ; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrh.16 q4, [r1, #48] -; CHECK-NEXT: vstrh.16 q2, [r1, #32] -; CHECK-NEXT: vstrh.16 q7, [r1, #16] +; CHECK-NEXT: vstrh.16 q0, [r1, #32] +; CHECK-NEXT: vstrh.16 q2, [r1, #48] +; CHECK-NEXT: adds r1, #64 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end ; CHECK-NEXT: add sp, #104 Index: llvm/test/CodeGen/Thumb2/mve-vmla.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmla.ll +++ llvm/test/CodeGen/Thumb2/mve-vmla.ll @@ -82,15 +82,13 @@ define void @vmla32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) { ; CHECK-LABEL: vmla32_in_loop: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! -; CHECK-NEXT: vldrw.u32 q1, [r2, #16]! +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmla.u32 q1, q0, r1 -; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -122,15 +120,13 @@ define void @vmla16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) { ; CHECK-LABEL: vmla16_in_loop: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r0, #16]! -; CHECK-NEXT: vldrh.u16 q1, [r2, #16]! +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmla.u16 q1, q0, r1 -; CHECK-NEXT: vstrh.16 q1, [r2] +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr @@ -162,15 +158,13 @@ define void @vmla8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) { ; CHECK-LABEL: vmla8_in_loop: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r0, #16]! -; CHECK-NEXT: vldrh.u16 q1, [r2, #16]! +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vldrh.u16 q1, [r2] ; CHECK-NEXT: subs r3, #16 ; CHECK-NEXT: vmla.u8 q1, q0, r1 -; CHECK-NEXT: vstrh.16 q1, [r2] +; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: bx lr