diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -634,6 +634,11 @@ Opc == ARM::t2BR_JT; } +static inline bool isLowOverheadTerminatorOpcode(int Opc) { + return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || + Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec; +} + static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -374,7 +374,8 @@ } if (isIndirectBranchOpcode(I->getOpcode()) || - isJumpTableBranchOpcode(I->getOpcode())) { + isJumpTableBranchOpcode(I->getOpcode()) || + isLowOverheadTerminatorOpcode(I->getOpcode())) { // Indirect branches and jump tables can't be analyzed, but we still want // to clean up any instructions at the tail of the basic block. CantAnalyze = true; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -330,9 +330,9 @@ ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vdup.32 q2, r12 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: b .LBB2_5 ; CHECK-NEXT: .LBB2_2: @ %cond.load25 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vmov r2, s0 @@ -344,7 +344,7 @@ ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[3], r2 ; CHECK-NEXT: .LBB2_3: @ %else26 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmul.f16 q0, q6, q5 ; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: vcvtt.f32.f16 s23, s1 @@ -355,9 +355,18 @@ ; CHECK-NEXT: vcvtb.f32.f16 s20, s0 ; CHECK-NEXT: vadd.f32 q5, q3, q5 ; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: bne .LBB2_4 -; CHECK-NEXT: b .LBB2_21 -; CHECK-NEXT: .LBB2_4: @ %vector.body +; CHECK-NEXT: bne .LBB2_5 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vcmp.u32 cs, q0, q4 +; CHECK-NEXT: vpsel q0, q5, q3 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vadd.f32 q0, q0, r0 +; CHECK-NEXT: b .LBB2_23 +; CHECK-NEXT: .LBB2_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov q3, q5 @@ -379,13 +388,13 @@ ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 ; CHECK-NEXT: lsls r4, r2, #31 -; CHECK-NEXT: bne .LBB2_9 -; CHECK-NEXT: @ %bb.5: @ %else -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bne .LBB2_10 +; CHECK-NEXT: @ %bb.6: @ %else +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bpl .LBB2_10 -; CHECK-NEXT: .LBB2_6: @ %cond.load6 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bpl .LBB2_11 +; CHECK-NEXT: .LBB2_7: @ %cond.load6 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vldr.16 s20, [r0, #2] ; CHECK-NEXT: vmov r5, s24 ; CHECK-NEXT: vmovx.f16 s24, s25 @@ -397,25 +406,25 @@ ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov.16 q5[3], r4 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bmi .LBB2_11 -; CHECK-NEXT: .LBB2_7: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bmi .LBB2_12 +; CHECK-NEXT: .LBB2_8: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: bmi .LBB2_12 -; CHECK-NEXT: .LBB2_8: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bmi .LBB2_13 +; CHECK-NEXT: .LBB2_9: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: b .LBB2_13 -; CHECK-NEXT: .LBB2_9: @ %cond.load -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: b .LBB2_14 +; CHECK-NEXT: .LBB2_10: @ %cond.load +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r0] ; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bmi .LBB2_6 -; CHECK-NEXT: .LBB2_10: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bmi .LBB2_7 +; CHECK-NEXT: .LBB2_11: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bpl .LBB2_7 -; CHECK-NEXT: .LBB2_11: @ %cond.load9 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bpl .LBB2_8 +; CHECK-NEXT: .LBB2_12: @ %cond.load9 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmovx.f16 s24, s20 ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vldr.16 s28, [r0, #4] @@ -428,9 +437,9 @@ ; CHECK-NEXT: vmov r4, s20 ; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: bpl .LBB2_8 -; CHECK-NEXT: .LBB2_12: @ %cond.load12 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bpl .LBB2_9 +; CHECK-NEXT: .LBB2_13: @ %cond.load12 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmovx.f16 s20, s24 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vmov r2, s20 @@ -441,8 +450,8 @@ ; CHECK-NEXT: vmov.16 q5[2], r2 ; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: .LBB2_13: @ %else13 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: .LBB2_14: @ %else13 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q7 ; CHECK-NEXT: vmrs r4, p0 @@ -460,13 +469,13 @@ ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 ; CHECK-NEXT: lsls r4, r2, #31 -; CHECK-NEXT: bne .LBB2_17 -; CHECK-NEXT: @ %bb.14: @ %else17 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bne .LBB2_18 +; CHECK-NEXT: @ %bb.15: @ %else17 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bpl .LBB2_18 -; CHECK-NEXT: .LBB2_15: @ %cond.load19 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bpl .LBB2_19 +; CHECK-NEXT: .LBB2_16: @ %cond.load19 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vldr.16 s24, [r1, #2] ; CHECK-NEXT: vmov r5, s28 ; CHECK-NEXT: vmovx.f16 s28, s29 @@ -478,23 +487,23 @@ ; CHECK-NEXT: vmov r4, s28 ; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bmi .LBB2_19 -; CHECK-NEXT: .LBB2_16: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bmi .LBB2_20 +; CHECK-NEXT: .LBB2_17: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi.w .LBB2_2 -; CHECK-NEXT: b .LBB2_20 -; CHECK-NEXT: .LBB2_17: @ %cond.load16 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: b .LBB2_21 +; CHECK-NEXT: .LBB2_18: @ %cond.load16 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vldr.16 s28, [r1] ; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bmi .LBB2_15 -; CHECK-NEXT: .LBB2_18: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bmi .LBB2_16 +; CHECK-NEXT: .LBB2_19: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bpl .LBB2_16 -; CHECK-NEXT: .LBB2_19: @ %cond.load22 -; CHECK-NEXT: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: bpl .LBB2_17 +; CHECK-NEXT: .LBB2_20: @ %cond.load22 +; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmovx.f16 s28, s24 ; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: vldr.16 s0, [r1, #4] @@ -508,19 +517,9 @@ ; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi.w .LBB2_2 -; CHECK-NEXT: .LBB2_20: @ in Loop: Header=BB2_4 Depth=1 +; CHECK-NEXT: .LBB2_21: @ in Loop: Header=BB2_5 Depth=1 ; CHECK-NEXT: vmov q6, q7 ; CHECK-NEXT: b .LBB2_3 -; CHECK-NEXT: .LBB2_21: @ %middle.block -; CHECK-NEXT: vdup.32 q0, r12 -; CHECK-NEXT: vcmp.u32 cs, q0, q4 -; CHECK-NEXT: vpsel q0, q5, q3 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vadd.f32 q0, q0, r0 -; CHECK-NEXT: b .LBB2_23 ; CHECK-NEXT: .LBB2_22: ; CHECK-NEXT: vldr s0, .LCPI2_0 ; CHECK-NEXT: .LBB2_23: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1468,7 +1468,7 @@ ; CHECK-NEXT: b .LBB9_6 ; CHECK-NEXT: .LBB9_3: ; CHECK-NEXT: vldr s0, .LCPI9_0 -; CHECK-NEXT: b .LBB9_9 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 @@ -1625,7 +1625,7 @@ ; CHECK-NEXT: b .LBB10_6 ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: vldr s0, .LCPI10_0 -; CHECK-NEXT: b .LBB10_9 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 @@ -1782,7 +1782,7 @@ ; CHECK-NEXT: b .LBB11_6 ; CHECK-NEXT: .LBB11_3: ; CHECK-NEXT: vldr s0, .LCPI11_0 -; CHECK-NEXT: b .LBB11_9 +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new ; CHECK-NEXT: bic r2, r2, #3 ; CHECK-NEXT: movs r3, #1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll @@ -12,41 +12,43 @@ ; CHECK-NEXT: lsl.w r12, r3, #1 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: mov r4, r1 -; CHECK-NEXT: .LBB0_2: @ %for.cond1.preheader.us +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: @ %for.body15.us +; CHECK-NEXT: @ Parent Loop BB0_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldrh.w r7, [r0, r6, lsl #1] +; CHECK-NEXT: ldrh.w r5, [r1, r6, lsl #1] +; CHECK-NEXT: add r5, r7 +; CHECK-NEXT: strh.w r5, [r2, r6, lsl #1] +; CHECK-NEXT: adds r6, #1 +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup14.us +; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: adds r3, #1 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: add r4, r12 +; CHECK-NEXT: cmp r3, r8 +; CHECK-NEXT: beq .LBB0_7 +; CHECK-NEXT: .LBB0_4: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 ; CHECK-NEXT: @ Child Loop BB0_5 Depth 2 +; CHECK-NEXT: @ Child Loop BB0_2 Depth 2 ; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: .LBB0_3: @ %for.body4.us -; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: .LBB0_5: @ %for.body4.us +; CHECK-NEXT: @ Parent Loop BB0_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh.w r5, [r0, r6, lsl #1] ; CHECK-NEXT: ldrh.w r7, [r1, r6, lsl #1] ; CHECK-NEXT: add r5, r7 ; CHECK-NEXT: strh.w r5, [r4, r6, lsl #1] ; CHECK-NEXT: adds r6, #1 -; CHECK-NEXT: le lr, .LBB0_3 -; CHECK-NEXT: @ %bb.4: @ %for.body15.us.preheader -; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: le lr, .LBB0_5 +; CHECK-NEXT: @ %bb.6: @ %for.body15.us.preheader +; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: .LBB0_5: @ %for.body15.us -; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 -; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh.w r7, [r0, r6, lsl #1] -; CHECK-NEXT: ldrh.w r5, [r1, r6, lsl #1] -; CHECK-NEXT: add r5, r7 -; CHECK-NEXT: strh.w r5, [r2, r6, lsl #1] -; CHECK-NEXT: adds r6, #1 -; CHECK-NEXT: le lr, .LBB0_5 -; CHECK-NEXT: @ %bb.6: @ %for.cond.cleanup14.us -; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add r4, r12 -; CHECK-NEXT: cmp r3, r8 -; CHECK-NEXT: bne .LBB0_2 +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_7: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -30,11 +30,28 @@ ; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 -; ENABLED-NEXT: b .LBB0_4 -; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 -; ENABLED-NEXT: movs r0, #0 -; ENABLED-NEXT: .LBB0_3: @ %for.end -; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; ENABLED-NEXT: b .LBB0_5 +; ENABLED-NEXT: .LBB0_2: @ %vector.body +; ENABLED-NEXT: @ Parent Loop BB0_5 Depth=1 +; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 +; ENABLED-NEXT: vctp.32 r4 +; ENABLED-NEXT: vmov q0, q1 +; ENABLED-NEXT: vpstt +; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 +; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 +; ENABLED-NEXT: mov lr, r6 +; ENABLED-NEXT: vmul.i32 q1, q2, q1 +; ENABLED-NEXT: subs r6, #1 +; ENABLED-NEXT: vshl.s32 q1, r5 +; ENABLED-NEXT: subs r4, #4 +; ENABLED-NEXT: vadd.i32 q1, q1, q0 +; ENABLED-NEXT: le lr, .LBB0_2 +; ENABLED-NEXT: @ %bb.3: @ %middle.block +; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1 +; ENABLED-NEXT: vpsel q0, q1, q0 +; ENABLED-NEXT: vaddv.u32 r0, q0 +; ENABLED-NEXT: .LBB0_4: @ %for.end +; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 ; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] @@ -42,13 +59,13 @@ ; ENABLED-NEXT: add.w r10, r10, #2 ; ENABLED-NEXT: cmp r8, r3 ; ENABLED-NEXT: beq .LBB0_8 -; ENABLED-NEXT: .LBB0_4: @ %for.body +; ENABLED-NEXT: .LBB0_5: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 -; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 +; ENABLED-NEXT: @ Child Loop BB0_2 Depth 2 ; ENABLED-NEXT: cmp r2, r8 -; ENABLED-NEXT: ble .LBB0_2 -; ENABLED-NEXT: @ %bb.5: @ %vector.ph -; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; ENABLED-NEXT: ble .LBB0_7 +; ENABLED-NEXT: @ %bb.6: @ %vector.ph +; ENABLED-NEXT: @ in Loop: Header=BB0_5 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 @@ -62,26 +79,10 @@ ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload -; ENABLED-NEXT: .LBB0_6: @ %vector.body -; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 -; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vctp.32 r4 -; ENABLED-NEXT: vmov q0, q1 -; ENABLED-NEXT: vpstt -; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 -; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 -; ENABLED-NEXT: mov lr, r6 -; ENABLED-NEXT: vmul.i32 q1, q2, q1 -; ENABLED-NEXT: subs r6, #1 -; ENABLED-NEXT: vshl.s32 q1, r5 -; ENABLED-NEXT: subs r4, #4 -; ENABLED-NEXT: vadd.i32 q1, q1, q0 -; ENABLED-NEXT: le lr, .LBB0_6 -; ENABLED-NEXT: @ %bb.7: @ %middle.block -; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; ENABLED-NEXT: vpsel q0, q1, q0 -; ENABLED-NEXT: vaddv.u32 r0, q0 -; ENABLED-NEXT: b .LBB0_3 +; ENABLED-NEXT: b .LBB0_2 +; ENABLED-NEXT: .LBB0_7: @ in Loop: Header=BB0_5 Depth=1 +; ENABLED-NEXT: movs r0, #0 +; ENABLED-NEXT: b .LBB0_4 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 ; ENABLED-NEXT: add sp, #4 ; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} @@ -101,11 +102,28 @@ ; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 -; NOREDUCTIONS-NEXT: b .LBB0_4 -; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 -; NOREDUCTIONS-NEXT: movs r0, #0 -; NOREDUCTIONS-NEXT: .LBB0_3: @ %for.end -; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; NOREDUCTIONS-NEXT: b .LBB0_5 +; NOREDUCTIONS-NEXT: .LBB0_2: @ %vector.body +; NOREDUCTIONS-NEXT: @ Parent Loop BB0_5 Depth=1 +; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 +; NOREDUCTIONS-NEXT: vctp.32 r4 +; NOREDUCTIONS-NEXT: vmov q0, q1 +; NOREDUCTIONS-NEXT: vpstt +; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 +; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 +; NOREDUCTIONS-NEXT: mov lr, r6 +; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 +; NOREDUCTIONS-NEXT: subs r6, #1 +; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 +; NOREDUCTIONS-NEXT: subs r4, #4 +; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 +; NOREDUCTIONS-NEXT: le lr, .LBB0_2 +; NOREDUCTIONS-NEXT: @ %bb.3: @ %middle.block +; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1 +; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 +; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 +; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.end +; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 ; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] @@ -113,13 +131,13 @@ ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 ; NOREDUCTIONS-NEXT: cmp r8, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 -; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body +; NOREDUCTIONS-NEXT: .LBB0_5: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 -; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 +; NOREDUCTIONS-NEXT: @ Child Loop BB0_2 Depth 2 ; NOREDUCTIONS-NEXT: cmp r2, r8 -; NOREDUCTIONS-NEXT: ble .LBB0_2 -; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph -; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; NOREDUCTIONS-NEXT: ble .LBB0_7 +; NOREDUCTIONS-NEXT: @ %bb.6: @ %vector.ph +; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_5 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 @@ -133,26 +151,10 @@ ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 ; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body -; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 -; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vctp.32 r4 -; NOREDUCTIONS-NEXT: vmov q0, q1 -; NOREDUCTIONS-NEXT: vpstt -; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 -; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 -; NOREDUCTIONS-NEXT: mov lr, r6 -; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 -; NOREDUCTIONS-NEXT: subs r6, #1 -; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 -; NOREDUCTIONS-NEXT: subs r4, #4 -; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 -; NOREDUCTIONS-NEXT: le lr, .LBB0_6 -; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block -; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 -; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 -; NOREDUCTIONS-NEXT: b .LBB0_3 +; NOREDUCTIONS-NEXT: b .LBB0_2 +; NOREDUCTIONS-NEXT: .LBB0_7: @ in Loop: Header=BB0_5 Depth=1 +; NOREDUCTIONS-NEXT: movs r0, #0 +; NOREDUCTIONS-NEXT: b .LBB0_4 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 ; NOREDUCTIONS-NEXT: add sp, #4 ; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -66,8 +66,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp7 = icmp sgt i32 %len, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -60,39 +60,39 @@ ; CHECK-NEXT: ldr r5, [sp, #28] ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_2: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: .LBB1_3: @ %if.end -; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: str.w r4, [r2, r1, lsl #2] -; CHECK-NEXT: adds r1, #1 -; CHECK-NEXT: cmp r1, r3 -; CHECK-NEXT: beq .LBB1_8 -; CHECK-NEXT: .LBB1_4: @ %for.body -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB1_6 Depth 2 -; CHECK-NEXT: adds r7, r5, #3 -; CHECK-NEXT: cmp.w r12, r7, lsr #2 -; CHECK-NEXT: beq .LBB1_2 -; CHECK-NEXT: @ %bb.5: @ %do.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: b .LBB1_6 +; CHECK-NEXT: .LBB1_2: @ %do.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 ; CHECK-NEXT: bic r9, r7, #3 ; CHECK-NEXT: mov r7, r5 ; CHECK-NEXT: mov r4, r3 ; CHECK-NEXT: add.w r8, r0, r9, lsl #2 ; CHECK-NEXT: dlstp.32 lr, r5 -; CHECK-NEXT: .LBB1_6: @ %do.body -; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 +; CHECK-NEXT: .LBB1_3: @ %do.body +; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vaddva.s32 r4, q0 -; CHECK-NEXT: letp lr, .LBB1_6 -; CHECK-NEXT: @ %bb.7: @ %if.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 +; CHECK-NEXT: letp lr, .LBB1_3 +; CHECK-NEXT: @ %bb.4: @ %if.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 ; CHECK-NEXT: sub.w r5, r5, r9 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: b .LBB1_3 +; CHECK-NEXT: .LBB1_5: @ %if.end +; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 +; CHECK-NEXT: str.w r4, [r2, r1, lsl #2] +; CHECK-NEXT: adds r1, #1 +; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: beq .LBB1_8 +; CHECK-NEXT: .LBB1_6: @ %for.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 +; CHECK-NEXT: adds r7, r5, #3 +; CHECK-NEXT: cmp.w r12, r7, lsr #2 +; CHECK-NEXT: bne .LBB1_2 +; CHECK-NEXT: @ %bb.7: @ in Loop: Header=BB1_6 Depth=1 +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1102,9 +1102,20 @@ ; CHECK-NEXT: add.w r3, r12, #16 ; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: b .LBB16_6 +; CHECK-NEXT: .LBB16_3: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldrh r1, [r6], #2 +; CHECK-NEXT: vldrh.u16 q1, [r0], #2 +; CHECK-NEXT: subs.w lr, lr, #1 +; CHECK-NEXT: vfma.f16 q0, q1, r1 +; CHECK-NEXT: bne .LBB16_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: add.w r5, r5, r8, lsl #1 +; CHECK-NEXT: .LBB16_5: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: subs.w r9, r9, #1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload @@ -1112,10 +1123,10 @@ ; CHECK-NEXT: add.w r0, r5, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 -; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #8 ; CHECK-NEXT: ldrh.w lr, [r12, #14] ; CHECK-NEXT: ldrh.w r0, [r12, #12] @@ -1152,14 +1163,14 @@ ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_11 +; CHECK-NEXT: @ %bb.7: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_8: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r0, [r6], #16 ; CHECK-NEXT: vldrw.u32 q1, [r5] @@ -1190,32 +1201,19 @@ ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r1 -; CHECK-NEXT: le lr, .LBB16_6 -; CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_8 +; CHECK-NEXT: .LBB16_9: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: beq.w .LBB16_3 -; CHECK-NEXT: b .LBB16_9 -; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: beq.w .LBB16_5 +; CHECK-NEXT: @ %bb.10: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov lr, r8 -; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 -; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r1, [r6], #2 -; CHECK-NEXT: vldrh.u16 q1, [r0], #2 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: vfma.f16 q0, q1, r1 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: add.w r5, r5, r8, lsl #1 ; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_11: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1074,19 +1074,30 @@ ; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: b .LBB16_4 -; CHECK-NEXT: .LBB16_3: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: b .LBB16_6 +; CHECK-NEXT: .LBB16_3: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldr r0, [r7], #4 +; CHECK-NEXT: vldrw.u32 q1, [r6], #4 +; CHECK-NEXT: subs.w lr, lr, #1 +; CHECK-NEXT: vfma.f32 q0, q1, r0 +; CHECK-NEXT: bne .LBB16_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: add.w r5, r5, r3, lsl #2 +; CHECK-NEXT: .LBB16_5: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: subs.w r10, r10, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: add.w r0, r5, r0, lsl #2 ; CHECK-NEXT: add.w r5, r0, #16 ; CHECK-NEXT: beq .LBB16_12 -; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 -; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: ldrd r7, r6, [r12] ; CHECK-NEXT: ldrd r0, r4, [r12, #8] @@ -1112,14 +1123,14 @@ ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 -; CHECK-NEXT: blo .LBB16_7 -; CHECK-NEXT: @ %bb.5: @ %for.body.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: blo .LBB16_11 +; CHECK-NEXT: @ %bb.7: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_6: @ %for.body -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: .LBB16_8: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldm.w r7, {r0, r3, r4, r6} ; CHECK-NEXT: vldrw.u32 q1, [r5], #32 @@ -1142,34 +1153,21 @@ ; CHECK-NEXT: adds r7, #32 ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: vfma.f32 q0, q1, r9 -; CHECK-NEXT: le lr, .LBB16_6 -; CHECK-NEXT: b .LBB16_8 -; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: .LBB16_8: @ %for.end -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: le lr, .LBB16_8 +; CHECK-NEXT: .LBB16_9: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload ; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: cmp.w r3, #0 -; CHECK-NEXT: beq .LBB16_3 -; CHECK-NEXT: b .LBB16_9 -; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: beq .LBB16_5 +; CHECK-NEXT: @ %bb.10: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: mov r6, r5 ; CHECK-NEXT: mov lr, r3 -; CHECK-NEXT: .LBB16_10: @ %while.body76 -; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 -; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r0, [r7], #4 -; CHECK-NEXT: vldrw.u32 q1, [r6], #4 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: bne .LBB16_10 -; CHECK-NEXT: b .LBB16_11 -; CHECK-NEXT: .LBB16_11: @ %while.end.loopexit -; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 -; CHECK-NEXT: add.w r5, r5, r3, lsl #2 ; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_11: @ in Loop: Header=BB16_6 Depth=1 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_12: @ %if.end ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -1581,25 +1579,27 @@ ; CHECK-NEXT: @ %bb.1: @ %do.body.preheader ; CHECK-NEXT: ldr.w r12, [sp, #20] ; CHECK-NEXT: lsr.w r5, lr, #2 -; CHECK-NEXT: .LBB18_2: @ %do.body -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB18_3 Depth 2 -; CHECK-NEXT: ldr r4, [r2] -; CHECK-NEXT: dls lr, r5 -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: .LBB18_3: @ %while.body -; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1 +; CHECK-NEXT: b .LBB18_4 +; CHECK-NEXT: .LBB18_2: @ %while.body +; CHECK-NEXT: @ Parent Loop BB18_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: vfms.f32 q2, q1, q0 ; CHECK-NEXT: vstrb.8 q2, [r3], #16 -; CHECK-NEXT: le lr, .LBB18_3 -; CHECK-NEXT: @ %bb.4: @ %while.end -; CHECK-NEXT: @ in Loop: Header=BB18_2 Depth=1 +; CHECK-NEXT: le lr, .LBB18_2 +; CHECK-NEXT: @ %bb.3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB18_4 Depth=1 ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: add.w r2, r2, #4 -; CHECK-NEXT: bne .LBB18_2 +; CHECK-NEXT: beq .LBB18_5 +; CHECK-NEXT: .LBB18_4: @ %do.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB18_2 Depth 2 +; CHECK-NEXT: ldr r4, [r2] +; CHECK-NEXT: dls lr, r5 +; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: b .LBB18_2 ; CHECK-NEXT: .LBB18_5: @ %do.end ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -325,23 +325,25 @@ ; CHECK-NEXT: adr r3, .LCPI8_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB8_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB8_3 Depth 2 -; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: .LBB8_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB8_2 Depth=1 +; CHECK-NEXT: b .LBB8_4 +; CHECK-NEXT: .LBB8_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB8_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! ; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB8_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: le lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB8_4 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB8_2 -; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup +; CHECK-NEXT: beq .LBB8_5 +; CHECK-NEXT: .LBB8_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB8_2 Depth 2 +; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: b .LBB8_2 +; CHECK-NEXT: .LBB8_5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -402,16 +404,9 @@ ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: .LBB9_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB9_3 Depth 2 -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: .LBB9_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB9_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q6, [q5, #48]! ; CHECK-NEXT: vldrw.u32 q7, [q3, #48]! @@ -419,11 +414,20 @@ ; CHECK-NEXT: vldrw.u32 q7, [q4, #48]! ; CHECK-NEXT: vadd.i32 q6, q6, q7 ; CHECK-NEXT: vstrb.8 q6, [r0], #16 -; CHECK-NEXT: le lr, .LBB9_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: le lr, .LBB9_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB9_4 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB9_2 +; CHECK-NEXT: beq .LBB9_5 +; CHECK-NEXT: .LBB9_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB9_2 Depth 2 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: b .LBB9_2 ; CHECK-NEXT: .LBB9_5: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -498,23 +502,25 @@ ; CHECK-NEXT: adr r3, .LCPI10_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB10_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB10_3 Depth 2 -; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: .LBB10_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB10_2 Depth=1 +; CHECK-NEXT: b .LBB10_4 +; CHECK-NEXT: .LBB10_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB10_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #508]! ; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB10_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: le lr, .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB10_4 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB10_2 -; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup +; CHECK-NEXT: beq .LBB10_5 +; CHECK-NEXT: .LBB10_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB10_2 Depth 2 +; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: b .LBB10_2 +; CHECK-NEXT: .LBB10_5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -578,15 +584,9 @@ ; CHECK-NEXT: adr r6, .LCPI11_0 ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: .LBB11_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: .LBB11_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: b .LBB11_4 +; CHECK-NEXT: .LBB11_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB11_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vmov.u16 r7, q2[6] ; CHECK-NEXT: vmov.u16 r3, q2[4] @@ -632,11 +632,19 @@ ; CHECK-NEXT: vmov.16 q3[6], r5 ; CHECK-NEXT: vmov.16 q3[7], r6 ; CHECK-NEXT: vstrb.8 q3, [r4], #16 -; CHECK-NEXT: le lr, .LBB11_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: le lr, .LBB11_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=1 ; CHECK-NEXT: cmp r8, r2 -; CHECK-NEXT: bne .LBB11_2 +; CHECK-NEXT: beq .LBB11_5 +; CHECK-NEXT: .LBB11_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB11_2 Depth 2 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: b .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} @@ -717,17 +725,9 @@ ; CHECK-NEXT: str r1, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: .LBB12_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: .LBB12_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 +; CHECK-NEXT: b .LBB12_4 +; CHECK-NEXT: .LBB12_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB12_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vmov.u16 r3, q5[2] ; CHECK-NEXT: vmov.u16 r5, q5[0] @@ -864,12 +864,22 @@ ; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrb.8 q0, [r4], #16 -; CHECK-NEXT: le lr, .LBB12_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: le lr, .LBB12_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB12_4 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: cmp r1, r2 -; CHECK-NEXT: bne.w .LBB12_2 +; CHECK-NEXT: beq .LBB12_5 +; CHECK-NEXT: .LBB12_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB12_2 Depth 2 +; CHECK-NEXT: ldr r1, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: b .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #104 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -460,29 +460,23 @@ ; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader +; CHECK-NEXT: b .LBB9_2 +; CHECK-NEXT: .LBB9_1: @ %for.cond4.for.cond.cleanup6_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: add.w r8, r8, #1 +; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: beq .LBB9_6 +; CHECK-NEXT: .LBB9_2: @ %for.cond8.preheader.us.us.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB9_2 Depth 2 +; CHECK-NEXT: @ Child Loop BB9_5 Depth 2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: mul r11, r8, r9 ; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: mul r7, r8, r12 -; CHECK-NEXT: .LBB9_2: @ %vector.ph -; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 -; CHECK-NEXT: @ => This Loop Header: Depth=2 -; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: vdup.32 q5, r7 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vshl.i32 q5, q5, #2 -; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: dls lr, r10 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vadd.i32 q5, q5, q0 -; CHECK-NEXT: vmlas.u32 q6, q2, r5 +; CHECK-NEXT: b .LBB9_5 ; CHECK-NEXT: .LBB9_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 -; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 +; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 +; CHECK-NEXT: @ Parent Loop BB9_5 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: vadd.i32 q7, q6, q3 ; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] @@ -492,19 +486,28 @@ ; CHECK-NEXT: vadd.i32 q4, q0, q4 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 +; CHECK-NEXT: @ in Loop: Header=BB9_5 Depth=2 ; CHECK-NEXT: add.w r4, r5, r11 ; CHECK-NEXT: adds r5, #1 ; CHECK-NEXT: vaddv.u32 r6, q4 ; CHECK-NEXT: cmp r5, r9 ; CHECK-NEXT: str.w r6, [r2, r4, lsl #2] -; CHECK-NEXT: bne .LBB9_2 -; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us -; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1 -; CHECK-NEXT: add.w r8, r8, #1 -; CHECK-NEXT: cmp r8, r3 -; CHECK-NEXT: bne .LBB9_1 -; CHECK-NEXT: @ %bb.6: @ %for.end25 +; CHECK-NEXT: beq .LBB9_1 +; CHECK-NEXT: .LBB9_5: @ %vector.ph +; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 +; CHECK-NEXT: @ => This Loop Header: Depth=2 +; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 +; CHECK-NEXT: vdup.32 q5, r7 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vshl.i32 q5, q5, #2 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: dls lr, r10 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlas.u32 q6, q2, r5 +; CHECK-NEXT: b .LBB9_3 +; CHECK-NEXT: .LBB9_6: @ %for.end25 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 @@ -861,36 +864,43 @@ ; CHECK-NEXT: movs r6, #11 ; CHECK-NEXT: vshl.i32 q1, q1, #2 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: .LBB11_1: @ %for.body10.i +; CHECK-NEXT: b .LBB11_2 +; CHECK-NEXT: .LBB11_1: @ %for.cond.cleanup20.i +; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #148] +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: cmp r5, r7 +; CHECK-NEXT: it eq +; CHECK-NEXT: moveq r5, #0 +; CHECK-NEXT: .LBB11_2: @ %for.body10.i ; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB11_2 Depth 2 -; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 -; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 -; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 +; CHECK-NEXT: @ Child Loop BB11_4 Depth 2 +; CHECK-NEXT: @ Child Loop BB11_9 Depth 3 +; CHECK-NEXT: @ Child Loop BB11_5 Depth 4 +; CHECK-NEXT: @ Child Loop BB11_6 Depth 5 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i -; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 +; CHECK-NEXT: b .LBB11_4 +; CHECK-NEXT: .LBB11_3: @ %for.cond.cleanup26.i +; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=2 +; CHECK-NEXT: adds r7, #1 +; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: beq .LBB11_1 +; CHECK-NEXT: .LBB11_4: @ %for.cond22.preheader.i +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 -; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 -; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 -; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 +; CHECK-NEXT: @ Child Loop BB11_9 Depth 3 +; CHECK-NEXT: @ Child Loop BB11_5 Depth 4 +; CHECK-NEXT: @ Child Loop BB11_6 Depth 5 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: .LBB11_3: @ %for.body27.i -; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 -; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 -; CHECK-NEXT: @ => This Loop Header: Depth=3 -; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 -; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: dls lr, r9 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov.w r11, #4 -; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i -; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 -; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 -; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 +; CHECK-NEXT: b .LBB11_9 +; CHECK-NEXT: .LBB11_5: @ %for.body78.us.i +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2 +; CHECK-NEXT: @ Parent Loop BB11_9 Depth=3 ; CHECK-NEXT: @ => This Loop Header: Depth=4 -; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 +; CHECK-NEXT: @ Child Loop BB11_6 Depth 5 ; CHECK-NEXT: mul r4, r11, r6 ; CHECK-NEXT: vdup.32 q3, r5 ; CHECK-NEXT: vdup.32 q2, r7 @@ -900,11 +910,11 @@ ; CHECK-NEXT: vadd.i32 q4, q0, r4 ; CHECK-NEXT: mov r4, r8 ; CHECK-NEXT: vmla.u32 q2, q4, r2 -; CHECK-NEXT: .LBB11_5: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 -; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 -; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 -; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 +; CHECK-NEXT: .LBB11_6: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2 +; CHECK-NEXT: @ Parent Loop BB11_9 Depth=3 +; CHECK-NEXT: @ Parent Loop BB11_5 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 ; CHECK-NEXT: vldrb.s32 q6, [r0, q2] ; CHECK-NEXT: vadd.i32 q5, q2, q1 @@ -915,31 +925,27 @@ ; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vmlava.u32 r12, q2, q6 ; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: bne .LBB11_5 -; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 +; CHECK-NEXT: bne .LBB11_6 +; CHECK-NEXT: @ %bb.7: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB11_5 Depth=4 ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: le lr, .LBB11_4 -; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i -; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3 +; CHECK-NEXT: le lr, .LBB11_5 +; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup77.i +; CHECK-NEXT: @ in Loop: Header=BB11_9 Depth=3 ; CHECK-NEXT: adds r5, #1 ; CHECK-NEXT: add.w r10, r10, #1 ; CHECK-NEXT: cmp r5, r2 -; CHECK-NEXT: bne .LBB11_3 -; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i -; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2 -; CHECK-NEXT: adds r7, #1 -; CHECK-NEXT: cmp r7, r3 -; CHECK-NEXT: bne .LBB11_2 -; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i -; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1 -; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r7, [sp, #148] -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: cmp r5, r7 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq r5, #0 -; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: beq .LBB11_3 +; CHECK-NEXT: .LBB11_9: @ %for.body27.i +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: @ Parent Loop BB11_4 Depth=2 +; CHECK-NEXT: @ => This Loop Header: Depth=3 +; CHECK-NEXT: @ Child Loop BB11_5 Depth 4 +; CHECK-NEXT: @ Child Loop BB11_6 Depth 5 +; CHECK-NEXT: dls lr, r9 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov.w r11, #4 +; CHECK-NEXT: b .LBB11_5 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: .LCPI11_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll @@ -17,23 +17,25 @@ ; CHECK-NEXT: adr r3, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: .LBB0_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 -; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: .LBB0_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB0_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! ; CHECK-NEXT: vstrb.8 q2, [r0], #16 -; CHECK-NEXT: le lr, .LBB0_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: cmp r12, r2 -; CHECK-NEXT: bne .LBB0_2 -; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup +; CHECK-NEXT: beq .LBB0_5 +; CHECK-NEXT: .LBB0_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB0_2 Depth 2 +; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -1866,8 +1866,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB11_8 +; CHECK-NEXT: cbz r3, .LBB11_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB11_3 @@ -2132,8 +2131,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB13_8 +; CHECK-NEXT: cbz r3, .LBB13_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB13_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -160,24 +160,26 @@ ; CHECK-NEXT: vadd.i32 q4, q3, r0 ; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: .LBB3_2: @ %vector.ph -; CHECK-NEXT: @ =>This Loop Header: Depth=1 -; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: dls lr, r3 -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: .LBB3_3: @ %vector.body -; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB3_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vstrw.32 q0, [q5, #48]! ; CHECK-NEXT: vstrw.32 q1, [q6, #48]! ; CHECK-NEXT: vstrw.32 q2, [q7, #48]! -; CHECK-NEXT: le lr, .LBB3_3 -; CHECK-NEXT: @ %bb.4: @ %middle.block -; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 +; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB3_4 Depth=1 ; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: bne .LBB3_2 +; CHECK-NEXT: beq .LBB3_5 +; CHECK-NEXT: .LBB3_4: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB3_2 Depth 2 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: b .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: b .LBB0_7 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: b .LBB0_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB0_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -45,7 +45,7 @@ ; CHECK-NEXT: ldr r1, [r2], #4 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: le lr, .LBB0_8 -; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -206,8 +206,8 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: b .LBB2_7 ; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: b .LBB2_9 +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB2_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -306,8 +306,8 @@ ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB3_7 ; CHECK-NEXT: .LBB3_3: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB3_9 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB3_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -406,8 +406,8 @@ ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB4_7 ; CHECK-NEXT: .LBB4_3: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB4_9 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB4_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -507,7 +507,8 @@ ; CHECK-NEXT: b .LBB5_7 ; CHECK-NEXT: .LBB5_3: ; CHECK-NEXT: vldr s0, .LCPI5_0 -; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB5_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -608,7 +609,8 @@ ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_3: ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00 -; CHECK-NEXT: b .LBB6_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB6_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -704,8 +706,8 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: b .LBB7_7 ; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: mvn r2, #-2147483648 -; CHECK-NEXT: b .LBB7_9 +; CHECK-NEXT: mvn r0, #-2147483648 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB7_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -804,7 +806,7 @@ ; CHECK-NEXT: b .LBB8_7 ; CHECK-NEXT: .LBB8_3: ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: b .LBB8_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB8_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -832,7 +834,7 @@ ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, lt ; CHECK-NEXT: le lr, .LBB8_8 -; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -900,8 +902,8 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: b .LBB9_7 ; CHECK-NEXT: .LBB9_3: -; CHECK-NEXT: mov.w r2, #-2147483648 -; CHECK-NEXT: b .LBB9_9 +; CHECK-NEXT: mov.w r0, #-2147483648 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB9_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1000,7 +1002,7 @@ ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: mov.w r0, #-2147483648 -; CHECK-NEXT: b .LBB10_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB10_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1028,7 +1030,7 @@ ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, gt ; CHECK-NEXT: le lr, .LBB10_8 -; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1096,8 +1098,8 @@ ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: b .LBB11_7 ; CHECK-NEXT: .LBB11_3: -; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: b .LBB11_9 +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB11_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1196,7 +1198,7 @@ ; CHECK-NEXT: b .LBB12_7 ; CHECK-NEXT: .LBB12_3: ; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: b .LBB12_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB12_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1224,7 +1226,7 @@ ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB12_8 -; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1292,8 +1294,8 @@ ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB13_7 ; CHECK-NEXT: .LBB13_3: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: b .LBB13_9 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB13_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1392,7 +1394,7 @@ ; CHECK-NEXT: b .LBB14_7 ; CHECK-NEXT: .LBB14_3: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: b .LBB14_9 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB14_4: @ %vector.ph ; CHECK-NEXT: bic r3, r1, #3 ; CHECK-NEXT: movs r2, #1 @@ -1420,7 +1422,7 @@ ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: csel r0, r0, r1, hi ; CHECK-NEXT: le lr, .LBB14_8 -; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: %cmp6 = icmp sgt i32 %n, 0 @@ -1489,7 +1491,8 @@ ; CHECK-NEXT: b .LBB15_7 ; CHECK-NEXT: .LBB15_3: ; CHECK-NEXT: vldr s0, .LCPI15_0 -; CHECK-NEXT: b .LBB15_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB15_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -1594,7 +1597,8 @@ ; CHECK-NEXT: b .LBB16_7 ; CHECK-NEXT: .LBB16_3: ; CHECK-NEXT: vldr s0, .LCPI16_0 -; CHECK-NEXT: b .LBB16_9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB16_4: @ %vector.ph ; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: movs r3, #1 @@ -1701,8 +1705,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB17_4: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6.not = icmp eq i32 %n, 0 @@ -1752,8 +1755,7 @@ ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB18_4: -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp8.not = icmp eq i32 %n, 0 @@ -1806,8 +1808,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB19_4: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6.not = icmp eq i32 %n, 0 @@ -1858,8 +1859,7 @@ ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB20_4: -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp9.not = icmp eq i32 %n, 0 @@ -1914,8 +1914,7 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB21_4: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp6.not = icmp eq i32 %n, 0 @@ -1966,8 +1965,7 @@ ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB22_4: -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: movs r0, #0 ; CHECK-NEXT: pop {r7, pc} entry: %cmp9.not = icmp eq i32 %n, 0 @@ -2327,7 +2325,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: cbz r1, .LBB29_3 +; CHECK-NEXT: cbz r1, .LBB29_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: mov r3, r2 @@ -2337,14 +2335,14 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vaddlva.s32 r2, r3, q0 ; CHECK-NEXT: letp lr, .LBB29_2 -; CHECK-NEXT: b .LBB29_4 -; CHECK-NEXT: .LBB29_3: -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup +; CHECK-NEXT: .LBB29_3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB29_4: +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: b .LBB29_3 entry: %cmp6.not = icmp eq i32 %n, 0 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph @@ -2380,7 +2378,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: cbz r2, .LBB30_3 +; CHECK-NEXT: cbz r2, .LBB30_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: mov r3, r12 @@ -2391,14 +2389,14 @@ ; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0 ; CHECK-NEXT: letp lr, .LBB30_2 -; CHECK-NEXT: b .LBB30_4 -; CHECK-NEXT: .LBB30_3: -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup +; CHECK-NEXT: .LBB30_3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB30_4: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: b .LBB30_3 entry: %cmp9.not = icmp eq i32 %n, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph @@ -2439,7 +2437,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: cbz r2, .LBB31_3 +; CHECK-NEXT: cbz r2, .LBB31_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: mov r3, r12 @@ -2450,14 +2448,14 @@ ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0 ; CHECK-NEXT: letp lr, .LBB31_2 -; CHECK-NEXT: b .LBB31_4 -; CHECK-NEXT: .LBB31_3: -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup +; CHECK-NEXT: .LBB31_3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB31_4: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: b .LBB31_3 entry: %cmp9.not = icmp eq i32 %n, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -6,8 +6,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq .LBB0_8 +; CHECK-NEXT: cbz r2, .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: cmp r2, #8 ; CHECK-NEXT: blo .LBB0_9