Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -361,7 +361,8 @@ bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; bool isUnspillableTerminator(const MachineInstr *MI) const override { - return MI->getOpcode() == ARM::t2LoopEndDec; + return MI->getOpcode() == ARM::t2LoopEndDec || + MI->getOpcode() == ARM::t2DoLoopStartTP; } private: Index: llvm/lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrThumb2.td +++ llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5427,6 +5427,7 @@ t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; +let isTerminator = 1, hasSideEffects = 1 in def t2DoLoopStartTP : t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; Index: llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp =================================================================== --- llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -329,13 +329,10 @@ for (MachineInstr &Use : MRI->use_instructions(LoopStart->getOperand(0).getReg())) if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) || - !DT->dominates(ML->getHeader(), Use.getParent())) - InsertPt = &Use; - if (InsertPt != MBB->end() && - !DT->dominates(MRI->getVRegDef(CountReg), &*InsertPt)) { - LLVM_DEBUG(dbgs() << " InsertPt does not dominate CountReg!\n"); - return false; - } + !DT->dominates(ML->getHeader(), Use.getParent())) { + LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n"); + return false; + } MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStartTP)) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -7,15 +7,15 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd r12, r4, [r0] -; CHECK-NEXT: ldrd r2, r3, [r0, #8] -; CHECK-NEXT: rsb r12, r12, r4, lsl #1 -; CHECK-NEXT: mov r4, r12 +; CHECK-NEXT: ldrd r12, r2, [r0] +; CHECK-NEXT: ldrd r4, r3, [r0, #8] +; CHECK-NEXT: rsb r12, r12, r2, lsl #1 +; CHECK-NEXT: mov r2, r12 ; CHECK-NEXT: dlstp.16 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r3], #16 -; CHECK-NEXT: vstrh.16 q0, [r2], #16 +; CHECK-NEXT: vstrh.16 q0, [r4], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: ldr r2, [r0] Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -6,41 +6,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: cmp r1, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r4, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r4, r1, r4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: adds r4, #3 -; CHECK-NEXT: add.w r12, r3, r4, lsr #2 +; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: dlstp.32 lr, r1 -; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r4], #16 +; CHECK-NEXT: vldrw.u32 q1, [r12], #16 ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dls lr, r12 -; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 -; CHECK-NEXT: vsubt.f32 q1, q1, r4 -; CHECK-NEXT: vfmat.f32 q0, q1, q1 -; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vsub.f32 q1, q1, r12 +; CHECK-NEXT: vfma.f32 q0, q1, q1 +; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: subs r0, r1, #1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -69,7 +69,7 @@ ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -78,17 +78,17 @@ ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vadd.i16 q1, q0, q1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vadd.i16 q1, q1, q2 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -142,7 +142,7 @@ ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 @@ -151,16 +151,16 @@ ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q0, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 ; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vsub.i8 q0, q2, q0 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vsub.i8 q1, q2, q1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -212,7 +212,7 @@ ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -221,16 +221,16 @@ ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vsub.i16 q0, q2, q0 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -284,7 +284,7 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #15 ; CHECK-NEXT: sub.w r12, r3, #16 ; CHECK-NEXT: movs r3, #1 @@ -293,16 +293,16 @@ ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 ; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 ; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vmul.i8 q0, q2, q0 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmul.i8 q1, q2, q1 +; CHECK-NEXT: vadd.i8 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -354,7 +354,7 @@ ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -363,16 +363,16 @@ ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 ; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 ; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: vmul.i16 q0, q2, q0 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vmul.i16 q1, q2, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -423,7 +423,7 @@ ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: subs r6, r3, #4 @@ -435,25 +435,25 @@ ; CHECK-NEXT: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u32 q0, [r4], #4 +; CHECK-NEXT: vldrbt.u32 q1, [r4], #4 ; CHECK-NEXT: vldrbt.u32 q2, [r5], #4 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -550,32 +550,32 @@ ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: add.w lr, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrbt.u16 q0, [r3], #8 +; CHECK-NEXT: vldrbt.u16 q1, [r3], #8 ; CHECK-NEXT: vldrbt.u16 q4, [r4], #8 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vsub.i16 q3, q4, q0 -; CHECK-NEXT: vmul.i16 q0, q4, q0 +; CHECK-NEXT: vsub.i16 q3, q4, q1 +; CHECK-NEXT: vmul.i16 q1, q4, q1 ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vadd.i16 q3, q3, q2 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r4, q2 ; CHECK-NEXT: vaddv.u16 r2, q0 ; CHECK-NEXT: b .LBB7_5 @@ -673,11 +673,11 @@ ; CHECK-NEXT: add.w lr, r12, r1, lsr #2 ; CHECK-NEXT: movw r1, :lower16:days ; CHECK-NEXT: movt r1, :upper16:days -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mla r1, r3, r4, r1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 Index: llvm/test/CodeGen/Thumb2/mve-fma-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -265,9 +265,9 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -529,9 +529,9 @@ ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -9,11 +9,11 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -80,11 +80,11 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI1_0 -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -155,8 +155,8 @@ ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 Index: llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -15,34 +15,34 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r5, [r0, #8] ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r4, r3, r5, lsl #2 +; CHECK-NEXT: add.w r3, r3, r5, lsl #2 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsl.w r9, r5, #2 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: mov r7, r3 +; CHECK-NEXT: mov r4, r5 ; CHECK-NEXT: dlstp.32 lr, r5 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: .LBB0_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q1, [r7], #16 -; CHECK-NEXT: vldrw.u32 q2, [r3], #16 +; CHECK-NEXT: vldrw.u32 q1, [r6], #16 +; CHECK-NEXT: vldrw.u32 q2, [r7], #16 ; CHECK-NEXT: vfma.f32 q0, q2, q1 ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: vadd.f32 s4, s2, s3 -; CHECK-NEXT: add.w r3, r2, r0, lsl #2 +; CHECK-NEXT: add.w r7, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: add r4, r9 +; CHECK-NEXT: add r3, r9 ; CHECK-NEXT: cmp r0, r12 ; CHECK-NEXT: vadd.f32 s0, s0, s4 -; CHECK-NEXT: vstr s0, [r3] +; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} @@ -115,43 +115,35 @@ ; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: add.w r0, r12, #3 -; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: add.w r11, r3, r12, lsl #2 ; CHECK-NEXT: add.w r7, r3, r12, lsl #3 ; CHECK-NEXT: lsl.w r9, r12, #3 -; CHECK-NEXT: add.w r8, r4, r0, lsr #2 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r11, r4, #1 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: add.w r10, r4, #1 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r10 -; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q2, [r6], #16 -; CHECK-NEXT: vldrwt.u32 q3, [r3], #16 -; CHECK-NEXT: vfmat.f32 q1, q3, q2 -; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q0, q3, q2 -; CHECK-NEXT: le lr, .LBB1_3 +; CHECK-NEXT: vldrw.u32 q2, [r5], #16 +; CHECK-NEXT: vldrw.u32 q3, [r3], #16 +; CHECK-NEXT: vfma.f32 q1, q3, q2 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vfma.f32 q0, q3, q2 +; CHECK-NEXT: letp lr, .LBB1_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s8, s2, s3 -; CHECK-NEXT: add.w r0, r2, r11, lsl #2 +; CHECK-NEXT: add.w r0, r2, r10, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r5, r9 +; CHECK-NEXT: add r11, r9 ; CHECK-NEXT: vadd.f32 s2, s6, s7 ; CHECK-NEXT: add r7, r9 ; CHECK-NEXT: vadd.f32 s4, s4, s5 @@ -241,84 +233,86 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: subs r1, #3 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r7, [r0, #8] +; CHECK-NEXT: ldr r3, [r0, #8] ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r0, r7, r7, lsl #1 -; CHECK-NEXT: add.w r12, r3, r7, lsl #2 -; CHECK-NEXT: add.w r1, r3, r7, lsl #3 -; CHECK-NEXT: add.w r8, r3, r0, lsl #2 -; CHECK-NEXT: adds r3, r7, #3 +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r3, r3, lsl #1 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #3 +; CHECK-NEXT: adds r3, #3 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: lsls r7, r0, #2 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r1, r0, lsl #2 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: add.w r3, r5, r3, lsr #2 -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: lsl.w r11, r0, #2 +; CHECK-NEXT: add.w r1, r5, r3, lsr #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldrd r0, r10, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r9, r5, #2 -; CHECK-NEXT: add.w r11, r5, #1 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r4, r8 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: adds r2, r5, #1 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: mov r8, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r10 -; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q3, [r6], #16 -; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 -; CHECK-NEXT: vfmat.f32 q1, q4, q3 -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vfmat.f32 q2, q4, q3 -; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 -; CHECK-NEXT: vfmat.f32 q0, q4, q3 -; CHECK-NEXT: le lr, .LBB2_3 +; CHECK-NEXT: vldrw.u32 q3, [r6], #16 +; CHECK-NEXT: vldrw.u32 q4, [r3], #16 +; CHECK-NEXT: vfma.f32 q1, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vfma.f32 q2, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r4], #16 +; CHECK-NEXT: vfma.f32 q0, q4, q3 +; CHECK-NEXT: letp lr, .LBB2_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s12, s10, s11 -; CHECK-NEXT: add.w r0, r2, r11, lsl #2 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r12, r7 +; CHECK-NEXT: add r9, r11 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r1, r7 +; CHECK-NEXT: add.w r0, r1, r2, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: add r8, r7 +; CHECK-NEXT: add r12, r11 ; CHECK-NEXT: vadd.f32 s6, s2, s3 +; CHECK-NEXT: add r10, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s8, s12 ; CHECK-NEXT: vadd.f32 s4, s4, s10 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: add.w r0, r2, r5, lsl #2 +; CHECK-NEXT: add.w r0, r1, r5, lsl #2 ; CHECK-NEXT: adds r5, #3 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: add.w r0, r2, r9, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: blo .LBB2_2 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -410,81 +404,76 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r1, r3, lsl #3 -; CHECK-NEXT: add.w r10, r1, r3, lsl #4 -; CHECK-NEXT: add.w r9, r1, r0, lsl #2 -; CHECK-NEXT: adds r0, r3, #3 +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add.w r12, r1, r2, lsl #2 +; CHECK-NEXT: add.w r8, r1, r2, lsl #3 +; CHECK-NEXT: add.w r9, r1, r2, lsl #4 +; CHECK-NEXT: add.w r11, r1, r0, lsl #2 +; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: lsls r7, r3, #4 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 -; CHECK-NEXT: strd r0, r3, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: lsls r0, r2, #4 +; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 ; CHECK-NEXT: adds r0, r6, #3 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r0, r6, #2 -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r5, r9 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: adds r0, r6, #1 +; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r5, r11 +; CHECK-NEXT: mov r4, r9 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB3_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r11 -; CHECK-NEXT: sub.w r11, r11, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 -; CHECK-NEXT: vfmat.f32 q3, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r3], #16 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vfmat.f32 q2, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r5], #16 -; CHECK-NEXT: vfmat.f32 q1, q5, q4 -; CHECK-NEXT: vldrwt.u32 q5, [r4], #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q0, q5, q4 -; CHECK-NEXT: le lr, .LBB3_3 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 +; CHECK-NEXT: vldrw.u32 q5, [r0], #16 +; CHECK-NEXT: vfma.f32 q3, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r3], #16 +; CHECK-NEXT: vfma.f32 q2, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r5], #16 +; CHECK-NEXT: vfma.f32 q1, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [r4], #16 +; CHECK-NEXT: vfma.f32 q0, q5, q4 +; CHECK-NEXT: letp lr, .LBB3_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: vadd.f32 s16, s14, s15 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: add r8, r7 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s14, s10, s11 -; CHECK-NEXT: add r12, r7 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add r9, r7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: add r10, r7 ; CHECK-NEXT: vadd.f32 s6, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s12, s16 @@ -492,20 +481,25 @@ ; CHECK-NEXT: vadd.f32 s4, s4, s10 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: add.w r0, r2, r6, lsl #2 +; CHECK-NEXT: add.w r0, r1, r6, lsl #2 ; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r2, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add r12, r0 +; CHECK-NEXT: add r8, r0 +; CHECK-NEXT: add r11, r0 +; CHECK-NEXT: add r9, r0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: blo .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -636,50 +630,46 @@ ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: add.w r11, r0, #1 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: add.w r10, r0, #2 -; CHECK-NEXT: adds r7, r0, #1 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r9, r3, r5 -; CHECK-NEXT: vctp.32 r11 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q6, [r3], #16 -; CHECK-NEXT: vfmat.f32 q3, q6, q5 +; CHECK-NEXT: vldrw.u32 q5, [r4], #16 +; CHECK-NEXT: vldrw.u32 q6, [r3], #16 +; CHECK-NEXT: vfma.f32 q3, q6, q5 ; CHECK-NEXT: add.w r12, r9, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r9] -; CHECK-NEXT: vfmat.f32 q4, q6, q5 -; CHECK-NEXT: sub.w r11, r11, #4 -; CHECK-NEXT: add.w r4, r12, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q6, [r12] -; CHECK-NEXT: vfmat.f32 q2, q6, q5 -; CHECK-NEXT: adds r6, r4, r5 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q6, [r4] -; CHECK-NEXT: vfmat.f32 q0, q6, q5 -; CHECK-NEXT: vldrwt.u32 q6, [r6] -; CHECK-NEXT: vfmat.f32 q1, q6, q5 -; CHECK-NEXT: le lr, .LBB4_3 +; CHECK-NEXT: vldrw.u32 q6, [r9] +; CHECK-NEXT: vfma.f32 q4, q6, q5 +; CHECK-NEXT: add.w r6, r12, r5 +; CHECK-NEXT: vldrw.u32 q6, [r12] +; CHECK-NEXT: vfma.f32 q2, q6, q5 +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vldrw.u32 q6, [r6] +; CHECK-NEXT: vfma.f32 q0, q6, q5 +; CHECK-NEXT: vldrw.u32 q6, [r7] +; CHECK-NEXT: vfma.f32 q1, q6, q5 +; CHECK-NEXT: letp lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: vadd.f32 s20, s18, s19 -; CHECK-NEXT: add.w r1, r2, r7, lsl #2 +; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s18, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 @@ -844,7 +834,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -861,48 +851,43 @@ ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r12, r3, r5 -; CHECK-NEXT: vctp.32 r8 -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q7, [r3], #16 -; CHECK-NEXT: vfmat.f32 q4, q7, q6 +; CHECK-NEXT: vldrw.u32 q6, [r1], #16 +; CHECK-NEXT: vldrw.u32 q7, [r3], #16 +; CHECK-NEXT: vfma.f32 q4, q7, q6 ; CHECK-NEXT: add.w r10, r12, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r12] -; CHECK-NEXT: vfmat.f32 q5, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r12] +; CHECK-NEXT: vfma.f32 q5, q7, q6 ; CHECK-NEXT: add.w r6, r10, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r10] -; CHECK-NEXT: vfmat.f32 q2, q7, q6 -; CHECK-NEXT: sub.w r8, r8, #4 +; CHECK-NEXT: vldrw.u32 q7, [r10] +; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q7, [r6] -; CHECK-NEXT: vfmat.f32 q0, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r6] +; CHECK-NEXT: vfma.f32 q0, q7, q6 ; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q7, [r7] -; CHECK-NEXT: vfmat.f32 q3, q7, q6 -; CHECK-NEXT: vldrwt.u32 q7, [r6] -; CHECK-NEXT: vfmat.f32 q1, q7, q6 -; CHECK-NEXT: le lr, .LBB5_3 +; CHECK-NEXT: vldrw.u32 q7, [r7] +; CHECK-NEXT: vfma.f32 q3, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r6] +; CHECK-NEXT: vfma.f32 q1, q7, q6 +; CHECK-NEXT: letp lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 ; CHECK-NEXT: vadd.f32 s24, s22, s23 @@ -940,7 +925,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: add r8, r1 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 @@ -1090,7 +1075,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -1109,27 +1094,29 @@ ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: mov r12, r7 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r10, r3, r5 -; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q7, [r9], #16 +; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: vfmat.f32 q5, q0, q7 ; CHECK-NEXT: add.w r11, r10, r5 @@ -1159,7 +1146,7 @@ ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r7] @@ -1215,7 +1202,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 +; CHECK-NEXT: add r9, r1 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 @@ -1378,7 +1365,7 @@ ; CHECK-NEXT: adds r0, r3, #3 ; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: lsls r5, r3, #2 @@ -1392,33 +1379,35 @@ ; CHECK-NEXT: adds r1, r0, #7 ; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #3 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q7, q3 ; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: mov r10, r7 ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r11, r3, r5 ; CHECK-NEXT: vctp.32 r10 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 ; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill @@ -1515,7 +1504,7 @@ ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: add r12, r1 ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -574,27 +574,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r10, lr} -; CHECK-NEXT: add.w r7, r0, #15 ; CHECK-NEXT: ldr.w r12, [sp, #32] -; CHECK-NEXT: asrs r6, r7, #31 -; CHECK-NEXT: add.w r7, r7, r6, lsr #28 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: asrs r5, r7, #4 -; CHECK-NEXT: cmp r5, #1 -; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r6, r7, #4 ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: adds r7, r2, r1 ; CHECK-NEXT: add.w r5, r2, r1, lsl #1 ; CHECK-NEXT: add.w r1, r1, r1, lsl #1 -; CHECK-NEXT: dlstp.8 lr, r0 -; CHECK-NEXT: add r1, r2 ; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: add r1, r2 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: dlstp.8 lr, r0 ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u8 q0, [r3], #16 @@ -691,21 +683,21 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: add.w r12, sp, #12 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #92] ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -714,7 +706,8 @@ ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r2, #7 -; CHECK-NEXT: lsrs r2, r0, #3 +; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 ; CHECK-NEXT: mov r10, r12 @@ -723,41 +716,42 @@ ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 ; CHECK-NEXT: add.w r0, r8, r10 -; CHECK-NEXT: ldr r1, [sp, #96] +; CHECK-NEXT: ldr r1, [sp, #100] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: ldr r0, [sp, #96] ; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] -; CHECK-NEXT: subs r0, r2, r2 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: subs r2, r0, r0 ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr.w r9, [sp, #88] +; CHECK-NEXT: ldr.w lr, [sp, #92] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: dlstp.16 lr, r9 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r11, r9, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r11, lr, r0 +; CHECK-NEXT: mov r9, lr +; CHECK-NEXT: ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload +; CHECK-NEXT: dlstp.16 lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vldrb.s16 q0, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vldrb.s16 q1, [r7], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 @@ -769,8 +763,8 @@ ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #96] -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: ldr r0, [sp, #100] +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -879,21 +873,21 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: add.w r12, sp, #12 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill +; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: bne .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #92] ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r2 @@ -902,34 +896,36 @@ ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r2, #7 -; CHECK-NEXT: lsrs r2, r0, #3 +; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: ldr r0, [sp, #96] ; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] -; CHECK-NEXT: subs r0, r2, r2 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: subs r2, r0, r0 ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr.w r9, [sp, #88] +; CHECK-NEXT: ldr.w lr, [sp, #92] ; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: dlstp.16 lr, r9 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r11, r9, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r11, lr, r0 +; CHECK-NEXT: mov r9, lr +; CHECK-NEXT: ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload +; CHECK-NEXT: dlstp.16 lr, lr ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vldrb.s16 q0, [r0], #8 ; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r3], #8 ; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vldrb.s16 q1, [r7], #8 ; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r6, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 @@ -947,17 +943,17 @@ ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 ; CHECK-NEXT: add.w r0, r8, r10 -; CHECK-NEXT: ldr r1, [sp, #96] +; CHECK-NEXT: ldr r1, [sp, #100] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #96] -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: ldr r0, [sp, #100] +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 Index: llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -14,8 +14,8 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -2330,8 +2330,8 @@ ; CHECK-NEXT: cbz r1, .LBB29_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB29_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2383,8 +2383,8 @@ ; CHECK-NEXT: cbz r2, .LBB30_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB30_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -2442,8 +2442,8 @@ ; CHECK-NEXT: cbz r2, .LBB31_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB31_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16