diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -361,7 +361,8 @@ bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { - return MI->getOpcode() == ARM::t2LoopEndDec; + return MI->getOpcode() == ARM::t2LoopEndDec || + MI->getOpcode() == ARM::t2DoLoopStartTP; } private: diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5427,6 +5427,7 @@ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; +let isTerminator = 1, hasSideEffects = 1 in def t2DoLoopStartTP : t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp --- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp @@ -341,13 +341,10 @@ for (MachineInstr &Use : MRI->use_instructions(LoopStart->getOperand(0).getReg())) if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) || - !DT->dominates(ML->getHeader(), Use.getParent())) - InsertPt = &Use; - if (InsertPt != MBB->end() && - !DT->dominates(MRI->getVRegDef(CountReg), &*InsertPt)) { - LLVM_DEBUG(dbgs() << " InsertPt does not dominate CountReg!\n"); - return false; - } + !DT->dominates(ML->getHeader(), Use.getParent())) { + LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n"); + return false; + } MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStartTP)) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -7,15 +7,15 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r12, r4, [r0] -; CHECK-NEXT: ldrd r2, r3, [r0, #8] -; CHECK-NEXT: rsb r12, r12, r4, lsl #1 -; CHECK-NEXT: mov r4, r12 +; CHECK-NEXT: ldrd r12, r2, [r0] +; CHECK-NEXT: ldrd r4, r3, [r0, #8] +; CHECK-NEXT: rsb r12, r12, r2, lsl #1 +; CHECK-NEXT: mov r2, r12 ; CHECK-NEXT: dlstp.16 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r3], #16 -; CHECK-NEXT: vstrh.16 q0, [r2], #16 +; CHECK-NEXT: vstrh.16 q0, [r4], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: ldr r2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r12], #16 @@ -17,30 +17,19 @@ ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 -; CHECK-NEXT: cmp r1, #4 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: subs r3, r1, r3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: adds r3, #3 -; CHECK-NEXT: add.w lr, lr, r3, lsr #2 ; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdiv.f32 s0, s0, s4 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vsubt.f32 q1, q1, r12 -; CHECK-NEXT: vfmat.f32 q0, q1, q1 -; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vsub.f32 q1, q1, r12 +; CHECK-NEXT: vfma.f32 q0, q1, q1 +; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: subs r0, r1, #1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -69,7 +69,7 @@ ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -78,17 +78,17 @@ ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vadd.i16 q1, q0, q1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vadd.i16 q1, q1, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -142,7 +142,7 @@ ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 @@ -151,16 +151,16 @@ ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q0, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 ; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vsub.i8 q0, q2, q0 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vsub.i8 q1, q2, q1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -212,7 +212,7 @@ ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -221,16 +221,16 @@ ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vsub.i16 q0, q2, q0 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -284,7 +284,7 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 @@ -293,16 +293,16 @@ ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 ; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vmul.i8 q0, q2, q0 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmul.i8 q1, q2, q1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -354,7 +354,7 @@ ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -363,16 +363,16 @@ ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vmul.i16 q0, q2, q0 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vmul.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -423,7 +423,7 @@ ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r6, r3, #4 @@ -435,16 +435,16 @@ ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u32 q0, [r4], #4 +; CHECK-NEXT: vldrbt.u32 q1, [r4], #4 ; CHECK-NEXT: vldrbt.u32 q2, [r5], #4 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 @@ -550,32 +550,32 @@ ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: add.w lr, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r3], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r3], #8 ; CHECK-NEXT: vldrbt.u16 q4, [r4], #8 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vsub.i16 q3, q4, q0 -; CHECK-NEXT: vmul.i16 q0, q4, q0 +; CHECK-NEXT: vsub.i16 q3, q4, q1 +; CHECK-NEXT: vmul.i16 q1, q4, q1 ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vadd.i16 q3, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r4, q2 ; CHECK-NEXT: vaddv.u16 r2, q0 ; CHECK-NEXT: b .LBB7_5 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -53,11 +53,11 @@ define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) { ; CHECK-LABEL: nested: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: cbz r3, .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r5, [sp, #24] +; CHECK-NEXT: ldr r5, [sp, #28] ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: b .LBB1_4 @@ -77,11 +77,11 @@ ; CHECK-NEXT: beq .LBB1_2 ; CHECK-NEXT: @ %bb.5: @ %do.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: bic r6, r7, #3 -; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: bic r9, r7, #3 ; CHECK-NEXT: mov r7, r5 -; CHECK-NEXT: add.w r8, r0, r6, lsl #2 ; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: add.w r8, r0, r9, lsl #2 +; CHECK-NEXT: dlstp.32 lr, r5 ; CHECK-NEXT: .LBB1_6: @ %do.body ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -90,11 +90,11 @@ ; CHECK-NEXT: letp lr, .LBB1_6 ; CHECK-NEXT: @ %bb.7: @ %if.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: subs r5, r5, r6 +; CHECK-NEXT: sub.w r5, r5, r9 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp20.not = icmp eq i32 %m, 0 br i1 %cmp20.not, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -266,8 +266,8 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -530,8 +530,8 @@ ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r12, #-2147483648 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -9,11 +9,11 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -80,11 +80,11 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI1_0 -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -155,8 +155,8 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -15,34 +15,34 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r5, [r0, #8] ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r4, r3, r5, lsl #2 +; CHECK-NEXT: add.w r3, r3, r5, lsl #2 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsl.w r9, r5, #2 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: mov r4, r5 ; CHECK-NEXT: dlstp.32 lr, r5 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: .LBB0_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q1, [r7], #16 -; CHECK-NEXT: vldrw.u32 q2, [r3], #16 +; CHECK-NEXT: vldrw.u32 q1, [r6], #16 +; CHECK-NEXT: vldrw.u32 q2, [r7], #16 ; CHECK-NEXT: vfma.f32 q0, q2, q1 ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: vadd.f32 s4, s2, s3 -; CHECK-NEXT: add.w r3, r2, r0, lsl #2 +; CHECK-NEXT: add.w r7, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: add r4, r9 +; CHECK-NEXT: add r3, r9 ; CHECK-NEXT: cmp r0, r12 ; CHECK-NEXT: vadd.f32 s0, s0, s4 -; CHECK-NEXT: vstr s0, [r3] +; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -115,43 +115,35 @@ ; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r0, r12, #3 -; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: add.w r11, r3, r12, lsl #2 ; CHECK-NEXT: add.w r7, r3, r12, lsl #3 ; CHECK-NEXT: lsl.w r9, r12, #3 -; CHECK-NEXT: add.w r8, r4, r0, lsr #2 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r11, r4, #1 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: add.w r10, r4, #1 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r10 -; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q2, [r6], #16 -; CHECK-NEXT: vldrwt.u32 q3, [r3], #16 -; CHECK-NEXT: vfmat.f32 q1, q3, q2 -; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q0, q3, q2 -; CHECK-NEXT: le lr, .LBB1_3 +; CHECK-NEXT: vldrw.u32 q2, [r5], #16 +; CHECK-NEXT: vldrw.u32 q3, [r3], #16 +; CHECK-NEXT: vfma.f32 q1, q3, q2 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vfma.f32 q0, q3, q2 +; CHECK-NEXT: letp lr, .LBB1_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s8, s2, s3 -; CHECK-NEXT: add.w r0, r2, r11, lsl #2 +; CHECK-NEXT: add.w r0, r2, r10, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r5, r9 +; CHECK-NEXT: add r11, r9 ; CHECK-NEXT: vadd.f32 s2, s6, s7 ; CHECK-NEXT: add r7, r9 ; CHECK-NEXT: vadd.f32 s4, s4, s5 @@ -241,84 +233,86 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: subs r1, #3 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r7, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r0, r7, r7, lsl #1 -; CHECK-NEXT: add.w r12, r3, r7, lsl #2 -; CHECK-NEXT: add.w r1, r3, r7, lsl #3 -; CHECK-NEXT: add.w r8, r3, r0, lsl #2 -; CHECK-NEXT: adds r3, r7, #3 +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r3, r3, lsl #1 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #3 +; CHECK-NEXT: adds r3, #3 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: lsls r7, r0, #2 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r1, r0, lsl #2 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w r3, r5, r3, lsr #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: lsl.w r11, r0, #2 +; CHECK-NEXT: add.w r1, r5, r3, lsr #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldrd r0, r10, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r9, r5, #2 -; CHECK-NEXT: add.w r11, r5, #1 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r4, r8 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: adds r2, r5, #1 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: mov r8, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r10 -; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q3, [r6], #16 -; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 -; CHECK-NEXT: vfmat.f32 q1, q4, q3 -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vfmat.f32 q2, q4, q3 -; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 -; CHECK-NEXT: vfmat.f32 q0, q4, q3 -; CHECK-NEXT: le lr, .LBB2_3 +; CHECK-NEXT: vldrw.u32 q3, [r6], #16 +; CHECK-NEXT: vldrw.u32 q4, [r3], #16 +; CHECK-NEXT: vfma.f32 q1, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vfma.f32 q2, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r4], #16 +; CHECK-NEXT: vfma.f32 q0, q4, q3 +; CHECK-NEXT: letp lr, .LBB2_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s12, s10, s11 -; CHECK-NEXT: add.w r0, r2, r11, lsl #2 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r12, r7 +; CHECK-NEXT: add r9, r11 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r1, r7 +; CHECK-NEXT: add.w r0, r1, r2, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: add r8, r7 +; CHECK-NEXT: add r12, r11 ; CHECK-NEXT: vadd.f32 s6, s2, s3 +; CHECK-NEXT: add r10, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s8, s12 ; CHECK-NEXT: vadd.f32 s4, s4, s10 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: add.w r0, r2, r5, lsl #2 +; CHECK-NEXT: add.w r0, r1, r5, lsl #2 ; CHECK-NEXT: adds r5, #3 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: add.w r0, r2, r9, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: blo .LBB2_2 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -410,81 +404,76 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r1, r3, lsl #3 -; CHECK-NEXT: add.w r10, r1, r3, lsl #4 -; CHECK-NEXT: add.w r9, r1, r0, lsl #2 -; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add.w r12, r1, r2, lsl #2 +; CHECK-NEXT: add.w r8, r1, r2, lsl #3 +; CHECK-NEXT: add.w r9, r1, r2, lsl #4 +; CHECK-NEXT: add.w r11, r1, r0, lsl #2 +; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: lsls r7, r3, #4 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 -; CHECK-NEXT: strd r0, r3, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: lsls r0, r2, #4 +; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 ; CHECK-NEXT: adds r0, r6, #3 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #2 -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: adds r0, r6, #1 +; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: mov r4, r9 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB3_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r11 -; CHECK-NEXT: sub.w r11, r11, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 -; CHECK-NEXT: vfmat.f32 q3, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r3], #16 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vfmat.f32 q2, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r5], #16 -; CHECK-NEXT: vfmat.f32 q1, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r4], #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q0, q5, q4 -; CHECK-NEXT: le lr, .LBB3_3 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: vldrw.u32 q5, [r0], #16 +; CHECK-NEXT: vfma.f32 q3, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r3], #16 +; CHECK-NEXT: vfma.f32 q2, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r5], #16 +; CHECK-NEXT: vfma.f32 q1, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r4], #16 +; CHECK-NEXT: vfma.f32 q0, q5, q4 +; CHECK-NEXT: letp lr, .LBB3_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: vadd.f32 s16, s14, s15 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: add r8, r7 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s14, s10, s11 -; CHECK-NEXT: add r12, r7 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r9, r7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: add r10, r7 ; CHECK-NEXT: vadd.f32 s6, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s12, s16 @@ -492,20 +481,25 @@ ; CHECK-NEXT: vadd.f32 s4, s4, s10 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: add.w r0, r2, r6, lsl #2 +; CHECK-NEXT: add.w r0, r1, r6, lsl #2 ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r12, r0 +; CHECK-NEXT: add r8, r0 +; CHECK-NEXT: add r11, r0 +; CHECK-NEXT: add r9, r0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: blo .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -636,50 +630,46 @@ ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: add.w r11, r0, #1 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: add.w r10, r0, #2 -; CHECK-NEXT: adds r7, r0, #1 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r9, r3, r5 -; CHECK-NEXT: vctp.32 r11 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q6, [r3], #16 -; CHECK-NEXT: vfmat.f32 q3, q6, q5 +; CHECK-NEXT: vldrw.u32 q5, [r4], #16 +; CHECK-NEXT: vldrw.u32 q6, [r3], #16 +; CHECK-NEXT: vfma.f32 q3, q6, q5 ; CHECK-NEXT: add.w r12, r9, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r9] -; CHECK-NEXT: vfmat.f32 q4, q6, q5 -; CHECK-NEXT: sub.w r11, r11, #4 -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r12] -; CHECK-NEXT: vfmat.f32 q2, q6, q5 -; CHECK-NEXT: adds r6, r4, r5 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q6, [r4] -; CHECK-NEXT: vfmat.f32 q0, q6, q5 -; CHECK-NEXT: vldrwt.u32 q6, [r6] -; CHECK-NEXT: vfmat.f32 q1, q6, q5 -; CHECK-NEXT: le lr, .LBB4_3 +; CHECK-NEXT: vldrw.u32 q6, [r9] +; CHECK-NEXT: vfma.f32 q4, q6, q5 +; CHECK-NEXT: add.w r6, r12, r5 +; CHECK-NEXT: vldrw.u32 q6, [r12] +; CHECK-NEXT: vfma.f32 q2, q6, q5 +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vldrw.u32 q6, [r6] +; CHECK-NEXT: vfma.f32 q0, q6, q5 +; CHECK-NEXT: vldrw.u32 q6, [r7] +; CHECK-NEXT: vfma.f32 q1, q6, q5 +; CHECK-NEXT: letp lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: vadd.f32 s20, s18, s19 -; CHECK-NEXT: add.w r1, r2, r7, lsl #2 +; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s18, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 @@ -844,7 +834,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -861,48 +851,43 @@ ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r12, r3, r5 -; CHECK-NEXT: vctp.32 r8 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q7, [r3], #16 -; CHECK-NEXT: vfmat.f32 q4, q7, q6 +; CHECK-NEXT: vldrw.u32 q6, [r1], #16 +; CHECK-NEXT: vldrw.u32 q7, [r3], #16 +; CHECK-NEXT: vfma.f32 q4, q7, q6 ; CHECK-NEXT: add.w r10, r12, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r12] -; CHECK-NEXT: vfmat.f32 q5, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r12] +; CHECK-NEXT: vfma.f32 q5, q7, q6 ; CHECK-NEXT: add.w r6, r10, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r10] -; CHECK-NEXT: vfmat.f32 q2, q7, q6 -; CHECK-NEXT: sub.w r8, r8, #4 +; CHECK-NEXT: vldrw.u32 q7, [r10] +; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r6] -; CHECK-NEXT: vfmat.f32 q0, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r6] +; CHECK-NEXT: vfma.f32 q0, q7, q6 ; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q7, [r7] -; CHECK-NEXT: vfmat.f32 q3, q7, q6 -; CHECK-NEXT: vldrwt.u32 q7, [r6] -; CHECK-NEXT: vfmat.f32 q1, q7, q6 -; CHECK-NEXT: le lr, .LBB5_3 +; CHECK-NEXT: vldrw.u32 q7, [r7] +; CHECK-NEXT: vfma.f32 q3, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r6] +; CHECK-NEXT: vfma.f32 q1, q7, q6 +; CHECK-NEXT: letp lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: vadd.f32 s24, s22, s23 @@ -940,7 +925,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: add r8, r1 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 @@ -1090,7 +1075,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -1109,27 +1094,29 @@ ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: mov r12, r7 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r10, r3, r5 -; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q7, [r9], #16 +; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: vfmat.f32 q5, q0, q7 ; CHECK-NEXT: add.w r11, r10, r5 @@ -1159,7 +1146,7 @@ ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r7] @@ -1215,7 +1202,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 +; CHECK-NEXT: add r9, r1 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 @@ -1378,7 +1365,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -1392,33 +1379,35 @@ ; CHECK-NEXT: adds r1, r0, #7 ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #3 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q7, q3 ; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: mov r10, r7 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r11, r3, r5 ; CHECK-NEXT: vctp.32 r10 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 ; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill @@ -1515,7 +1504,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: add r12, r1 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -683,8 +683,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 ; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill @@ -694,10 +694,10 @@ ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #92] ; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r10, [sp, #72] +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -706,62 +706,66 @@ ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r2, #7 -; CHECK-NEXT: lsrs r2, r0, #3 +; CHECK-NEXT: lsrs r1, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: add.w r1, r12, r8 -; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: ldr r1, [sp, #96] +; CHECK-NEXT: add.w r0, r12, r8 +; CHECK-NEXT: ldr r1, [sp, #100] +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 ; CHECK-NEXT: strb.w r0, [r1, r9] ; CHECK-NEXT: add.w r9, r9, #1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: cmp r2, r2 -; CHECK-NEXT: ldr.w r0, [r0, r9, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: cmp r1, r1 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [r0, r9, lsl #2] ; CHECK-NEXT: bge .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr.w r11, [sp, #88] -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: ldr r2, [sp, #92] +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mla r3, r9, r2, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r10 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 -; CHECK-NEXT: vmlava.s16 r0, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r7], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #96] -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: ldr r0, [sp, #100] +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -870,8 +874,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 ; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill @@ -881,10 +885,10 @@ ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #92] ; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r10, [sp, #72] +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -893,61 +897,65 @@ ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r2, #7 -; CHECK-NEXT: lsrs r2, r0, #3 +; CHECK-NEXT: lsrs r1, r0, #3 ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: cmp r2, r2 -; CHECK-NEXT: ldr.w r0, [r0, r9, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: cmp r1, r1 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr.w r10, [r0, r9, lsl #2] ; CHECK-NEXT: bge .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr.w r11, [sp, #88] -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: dlstp.16 lr, r11 -; CHECK-NEXT: ldm.w sp, {r4, r5, r7} @ 12-byte Folded Reload -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: ldr r2, [sp, #92] +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mla r3, r9, r2, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r10 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 -; CHECK-NEXT: vmlava.s16 r0, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r7], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r10 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: add.w r1, r12, r8 -; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: ldr r1, [sp, #96] +; CHECK-NEXT: add.w r0, r12, r8 +; CHECK-NEXT: ldr r1, [sp, #100] +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 ; CHECK-NEXT: strb.w r0, [r1, r9] ; CHECK-NEXT: add.w r9, r9, #1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #96] -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: ldr r0, [sp, #100] +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -14,8 +14,8 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -2330,8 +2330,8 @@ ; CHECK-NEXT: cbz r1, .LBB29_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB29_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2383,8 +2383,8 @@ ; CHECK-NEXT: cbz r2, .LBB30_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB30_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2442,8 +2442,8 @@ ; CHECK-NEXT: cbz r2, .LBB31_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB31_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16