Index: llvm/lib/Analysis/ScalarEvolution.cpp =================================================================== --- llvm/lib/Analysis/ScalarEvolution.cpp +++ llvm/lib/Analysis/ScalarEvolution.cpp @@ -2601,6 +2601,19 @@ } } + // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) + if (Ops.size() == 2) { + const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]); + if (Mul && Mul->getNumOperands() == 2 && + Mul->getOperand(0)->isAllOnesValue()) { + const SCEV *X; + const SCEV *Y; + if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) { + return getMulExpr(Y, getUDivExpr(X, Y)); + } + } + } + // Skip past any other cast SCEVs. while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr) ++Idx; Index: llvm/test/Analysis/ScalarEvolution/urem-0.ll =================================================================== --- llvm/test/Analysis/ScalarEvolution/urem-0.ll +++ llvm/test/Analysis/ScalarEvolution/urem-0.ll @@ -62,7 +62,7 @@ ; CHECK-NEXT: %urem = urem i32 %arg, 8 ; CHECK-NEXT: --> (zext i3 (trunc i32 %arg to i3) to i32) U: [0,8) S: [0,8) ; CHECK-NEXT: %sub = sub i32 %arg, %urem -; CHECK-NEXT: --> ((-1 * (zext i3 (trunc i32 %arg to i3) to i32)) + %arg) U: full-set S: full-set +; CHECK-NEXT: --> (8 * (%arg /u 8)) U: [0,-7) S: [-2147483648,2147483641) ; CHECK-NEXT: Determining loop execution counts for: @test_sub_urem ; %urem = urem i32 %arg, 8 @@ -78,7 +78,7 @@ ; CHECK-NEXT: %zext = zext i3 %trunc to i32 ; CHECK-NEXT: --> (zext i3 (trunc i32 %arg to i3) to i32) U: [0,8) S: [0,8) ; CHECK-NEXT: %sub = sub i32 %arg, %zext -; CHECK-NEXT: --> ((-1 * (zext i3 (trunc i32 %arg to i3) to i32)) + %arg) U: full-set S: full-set +; CHECK-NEXT: --> (8 * (%arg /u 8)) U: [0,-7) S: [-2147483648,2147483641) ; CHECK-NEXT: Determining loop execution counts for: @test_trunc_zext ; %trunc = trunc i32 %arg to i3 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- 
llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -6,7 +6,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: beq .LBB0_11 +; CHECK-NEXT: beq.w .LBB0_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck ; CHECK-NEXT: add.w r5, r0, r3, lsl #2 ; CHECK-NEXT: add.w r4, r2, r3, lsl #2 @@ -42,19 +42,21 @@ ; CHECK-NEXT: letp lr, .LBB0_5 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w lr, r3, r12 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: add.w lr, r5, r3, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r1, r4 ; CHECK-NEXT: adds r6, r2, r4 ; CHECK-NEXT: adds r7, r0, r4 -; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vldr s0, [r5] ; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: vldr s0, [r5] +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vldr s2, [r6] -; CHECK-NEXT: cmp lr, r3 ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: vldr s0, [r5, #4] @@ -69,7 +71,7 @@ ; CHECK-NEXT: vldr s2, [r6, #12] ; CHECK-NEXT: vmul.f32 s0, s2, s0 ; CHECK-NEXT: vstr s0, [r7, #12] -; CHECK-NEXT: bne .LBB0_7 +; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r12, .LBB0_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1427,17 +1427,21 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 -; CHECK-NEXT: vldr s0, .LCPI9_0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs 
.LBB9_4 ; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: vldr s0, .LCPI9_0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB9_6 ; CHECK-NEXT: .LBB9_3: ; CHECK-NEXT: vldr s0, .LCPI9_0 ; CHECK-NEXT: b .LBB9_9 ; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new -; CHECK-NEXT: sub.w lr, r2, r12 +; CHECK-NEXT: bic r2, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldr s0, .LCPI9_0 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB9_5: @ %for.body @@ -1455,19 +1459,18 @@ ; CHECK-NEXT: vmul.f16 s4, s6, s4 ; CHECK-NEXT: vldr.16 s6, [r4, #2] ; CHECK-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vmul.f16 s6, s8, s6 ; CHECK-NEXT: vldr.16 s8, [r4] ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-NEXT: adds r3, #8 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vmul.f16 s8, s10, s8 -; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: bne .LBB9_5 +; CHECK-NEXT: le lr, .LBB9_5 ; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r12, .LBB9_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader @@ -1579,17 +1582,21 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 -; CHECK-NEXT: vldr s0, .LCPI10_0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB10_4 ; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: vldr s0, .LCPI10_0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB10_6 ; CHECK-NEXT: .LBB10_3: ; CHECK-NEXT: vldr s0, .LCPI10_0 ; CHECK-NEXT: b .LBB10_9 ; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new -; CHECK-NEXT: sub.w lr, r2, r12 +; CHECK-NEXT: bic r2, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldr s0, .LCPI10_0 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: movs r2, #0 ; 
CHECK-NEXT: .LBB10_5: @ %for.body @@ -1607,19 +1614,18 @@ ; CHECK-NEXT: vadd.f16 s4, s6, s4 ; CHECK-NEXT: vldr.16 s6, [r4, #2] ; CHECK-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: adds r3, #8 ; CHECK-NEXT: vadd.f16 s6, s8, s6 ; CHECK-NEXT: vldr.16 s8, [r4] ; CHECK-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-NEXT: adds r3, #8 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vadd.f16 s8, s10, s8 -; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: vcvtb.f32.f16 s8, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: bne .LBB10_5 +; CHECK-NEXT: le lr, .LBB10_5 ; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r12, .LBB10_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader @@ -1731,26 +1737,29 @@ ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 -; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: b .LBB11_6 ; CHECK-NEXT: .LBB11_3: ; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: b .LBB11_9 ; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new -; CHECK-NEXT: sub.w lr, r2, r12 -; CHECK-NEXT: adds r3, r1, #4 +; CHECK-NEXT: bic r2, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldr s0, .LCPI11_0 ; CHECK-NEXT: adds r4, r0, #4 +; CHECK-NEXT: add.w lr, r3, r2, lsr #2 +; CHECK-NEXT: adds r3, r1, #4 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB11_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh.w r5, [r3, #2] ; CHECK-NEXT: vldr.16 s2, [r4, #2] ; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: vmov s4, r5 ; CHECK-NEXT: ldrsh r5, [r3], #8 ; CHECK-NEXT: vcvt.f16.s32 s4, s4 @@ -1773,12 +1782,12 @@ ; CHECK-NEXT: vmul.f16 s8, s8, s10 ; CHECK-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-NEXT: vcvtb.f32.f16 
s8, s8 -; CHECK-NEXT: add.w r4, r4, #8 +; CHECK-NEXT: adds r4, #8 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: bne .LBB11_5 +; CHECK-NEXT: le lr, .LBB11_5 ; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: wls lr, r12, .LBB11_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -328,35 +328,35 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr.w r12, [sp, #28] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq.w .LBB5_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 +; CHECK-NEXT: add.w r5, r3, r4, lsl #2 +; CHECK-NEXT: adds r6, r1, r4 ; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: add.w r7, r0, r4 +; CHECK-NEXT: cset r12, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: tst r4, r5 +; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: tst r7, r5 ; CHECK-NEXT: it eq -; CHECK-NEXT: andseq.w r7, r7, r6 +; CHECK-NEXT: andseq.w r7, r6, r12 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r12, #0 +; 
CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: b .LBB5_8 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: dlstp.32 lr, r4 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -366,17 +366,19 @@ ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_11 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r8, r12, lr +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: add.w r8, r8, #4 ; CHECK-NEXT: ldrb r4, [r7, #-1] -; CHECK-NEXT: cmp r8, r12 ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] ; CHECK-NEXT: ldrb r9, [r6, #-2] @@ -391,14 +393,14 @@ ; CHECK-NEXT: ldrb r4, [r7, #-2] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #4] -; CHECK-NEXT: add.w r5, r5, #16 -; CHECK-NEXT: bne .LBB5_7 +; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, lr, .LBB5_11 +; CHECK-NEXT: wls lr, r12, .LBB5_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: add r0, r8 +; CHECK-NEXT: add r1, r8 +; CHECK-NEXT: add.w r3, r3, r8, lsl #2 ; CHECK-NEXT: .LBB5_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ -602,35 +604,35 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr.w r12, [sp, #28] -; CHECK-NEXT: cmp.w r12, #0 +; 
CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq.w .LBB7_11 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12 +; CHECK-NEXT: add.w r5, r3, r4, lsl #2 +; CHECK-NEXT: adds r6, r1, r4 ; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12 -; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: add.w r7, r0, r4 +; CHECK-NEXT: cset r12, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: tst r4, r5 +; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: tst r7, r5 ; CHECK-NEXT: it eq -; CHECK-NEXT: andseq.w r7, r7, r6 +; CHECK-NEXT: andseq.w r7, r6, r12 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: b .LBB7_8 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: dlstp.32 lr, r4 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrb.u32 q0, [r0], #4 @@ -640,17 +642,19 @@ ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_11 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r8, r12, lr +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: adds r6, r0, #3 ; CHECK-NEXT: adds r7, r1, #1 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r9, [r6, #-3] -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: add.w r8, r8, #4 ; CHECK-NEXT: 
ldrb r4, [r7, #-1] -; CHECK-NEXT: cmp r8, r12 ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] ; CHECK-NEXT: ldrb r9, [r6, #-2] @@ -665,14 +669,14 @@ ; CHECK-NEXT: ldrb r4, [r7, #-2] ; CHECK-NEXT: smlabb r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #4] -; CHECK-NEXT: add.w r5, r5, #16 -; CHECK-NEXT: bne .LBB7_7 +; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, lr, .LBB7_11 +; CHECK-NEXT: wls lr, r12, .LBB7_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: add r0, r8 +; CHECK-NEXT: add r1, r8 +; CHECK-NEXT: add.w r3, r3, r8, lsl #2 ; CHECK-NEXT: .LBB7_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r7, [r0], #1 @@ -876,35 +880,35 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_int: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr.w r12, [sp, #28] -; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: ldr r4, [sp, #28] +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq.w .LBB9_11 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck -; CHECK-NEXT: add.w r5, r3, r12, lsl #2 -; CHECK-NEXT: add.w r6, r1, r12, lsl #2 +; CHECK-NEXT: add.w r5, r3, r4, lsl #2 +; CHECK-NEXT: add.w r6, r1, r4, lsl #2 ; CHECK-NEXT: cmp r5, r1 -; CHECK-NEXT: add.w r4, r0, r12, lsl #2 -; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: add.w r7, r0, r4, lsl #2 +; CHECK-NEXT: cset r12, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cset r6, hi ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r3 -; CHECK-NEXT: cset r4, hi -; CHECK-NEXT: tst r4, r5 +; CHECK-NEXT: cmp r7, r3 +; CHECK-NEXT: cset r7, hi +; CHECK-NEXT: tst r7, r5 ; CHECK-NEXT: it eq -; CHECK-NEXT: andseq.w r7, r7, r6 +; CHECK-NEXT: andseq.w r7, r6, r12 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: sub.w r4, r12, #1 
-; CHECK-NEXT: and lr, r12, #3 -; CHECK-NEXT: cmp r4, #3 +; CHECK-NEXT: subs r7, r4, #1 +; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: dlstp.32 lr, r12 +; CHECK-NEXT: dlstp.32 lr, r4 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -914,17 +918,19 @@ ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r8, r12, lr +; CHECK-NEXT: bic r7, r4, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r5, r3, #8 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: add.w r6, r0, #8 ; CHECK-NEXT: add.w r7, r1, #8 -; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB9_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r9, [r6, #-8] -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: add.w r8, r8, #4 ; CHECK-NEXT: ldr r4, [r7, #-8] -; CHECK-NEXT: cmp r8, r12 ; CHECK-NEXT: mla r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #-8] ; CHECK-NEXT: ldr r9, [r6, #-4] @@ -936,19 +942,19 @@ ; CHECK-NEXT: mla r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5] ; CHECK-NEXT: ldr.w r9, [r6, #4] -; CHECK-NEXT: add.w r6, r6, #16 +; CHECK-NEXT: adds r6, #16 ; CHECK-NEXT: ldr r4, [r7, #4] -; CHECK-NEXT: add.w r7, r7, #16 +; CHECK-NEXT: adds r7, #16 ; CHECK-NEXT: mla r4, r4, r9, r2 ; CHECK-NEXT: str r4, [r5, #4] -; CHECK-NEXT: add.w r5, r5, #16 -; CHECK-NEXT: bne .LBB9_7 +; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: le lr, .LBB9_7 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: wls lr, lr, .LBB9_11 +; CHECK-NEXT: wls lr, r12, .LBB9_11 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader -; CHECK-NEXT: add.w r0, r0, r12, lsl #2 -; CHECK-NEXT: add.w r1, r1, r12, lsl #2 -; 
CHECK-NEXT: add.w r3, r3, r12, lsl #2 +; CHECK-NEXT: add.w r0, r0, r8, lsl #2 +; CHECK-NEXT: add.w r1, r1, r8, lsl #2 +; CHECK-NEXT: add.w r3, r3, r8, lsl #2 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r7, [r0], #4 Index: llvm/test/Transforms/HardwareLoops/ARM/structure.ll =================================================================== --- llvm/test/Transforms/HardwareLoops/ARM/structure.ll +++ llvm/test/Transforms/HardwareLoops/ARM/structure.ll @@ -321,8 +321,7 @@ ; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader ; CHECK-UNROLL-NOT: dls ; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body -; CHECK-UNROLL-NOT: le lr, [[LOOP]] -; CHECK-UNROLL: bne [[LOOP]] +; CHECK-UNROLL: le lr, [[LOOP]] ; CHECK-UNROLL: wls lr, r12, [[EXIT:.LBB[0-9_]+]] ; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]: ; CHECK-UNROLL: le lr, [[EPIL]] @@ -359,8 +358,7 @@ ; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader ; CHECK-UNROLL-NOT: dls ; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body -; CHECK-UNROLL-NOT: le lr, [[LOOP]] -; CHECK-UNROLL: bne [[LOOP]] +; CHECK-UNROLL: le lr, [[LOOP]] ; CHECK-UNROLL: wls lr, r12, [[EPIL_EXIT:.LBB[0-9_]+]] ; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]: ; CHECK-UNROLL: le lr, [[EPIL]] Index: llvm/test/Transforms/LoopUnroll/runtime-loop5.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/runtime-loop5.ll +++ llvm/test/Transforms/LoopUnroll/runtime-loop5.ll @@ -95,7 +95,7 @@ ; UNROLL-4-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i3, i3* [[A]], i64 [[INDVARS_IV_NEXT_2]] ; UNROLL-4-NEXT: [[TMP5:%.*]] = load i3, i3* [[ARRAYIDX_3]], align 1 ; UNROLL-4-NEXT: [[ADD_3]] = add nsw i3 [[TMP5]], [[ADD_2]] -; UNROLL-4-NEXT: [[INDVARS_IV_NEXT_3]] = add i64 [[INDVARS_IV_NEXT_2]], 1 +; UNROLL-4-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 ; UNROLL-4-NEXT: [[NITER_NEXT_3]] = add i3 [[NITER_NEXT_2]], 1 ; UNROLL-4-NEXT: 
[[NITER_NCMP_3:%.*]] = icmp eq i3 [[NITER_NEXT_3]], [[UNROLL_ITER]] ; UNROLL-4-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]