Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -369,11 +369,18 @@
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
 
-  int VectorWidth = VecTy->getNumElements();
   auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
   auto *BTC = SE->getSCEV(BackedgeTakenCount);
+  auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
+
+  if (isa<SCEVCouldNotCompute>(MaxBTC)) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
+               BTC->dump());
+    return false;
+  }
 
-  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+  APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
+  if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
       !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
                BTC->dump());
@@ -397,6 +404,7 @@
   //
   auto *TC = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+  int VectorWidth = VecTy->getNumElements();
   auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
   uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
@@ -404,7 +412,7 @@
   if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
                dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
-               dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";);
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
     return false;
   }
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -1,6 +1,4 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
-; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled \
-; RUN:   -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE
 
 ; CHECK-LABEL: reduction_i32
 ; CHECK: phi i32 [ 0, %vector.ph ]
@@ -136,16 +134,15 @@
   ret i16 %res.0
 }
 
-; The vector loop is not guarded with an entry check (N == 0).
-; This means we can't calculate a precise range for the backedge count in
-; @llvm.get.active.lane.mask, and are assuming overflow can happen and thus
-; we can't insert the VCTP here.
+; The vector loop is not guarded with an entry check (N == 0). Check that
+; despite this we can still calculate a precise enough range for the
+; backedge count to safely insert a VCTP here.
; ; CHECK-LABEL: @reduction_not_guarded ; ; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: @llvm.get.active.lane.mask.v8i1.i32 +; CHECK: @llvm.arm.mve.vctp +; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 ; CHECK: ret ; define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { @@ -196,24 +193,10 @@ ret i16 %tmp9 } -; Without forcing tail-predication, we bail because overflow analysis says: -; -; overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<%for.body> -; ; CHECK-LABEL: @Correlation -; -; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) -; -; FORCE-LABEL: @Correlation -; FORCE: vector.ph: ; preds = %for.body -; FORCE: %trip.count.minus.1 = add i32 %{{.*}}, -1 -; FORCE: call void @llvm.set.loop.iterations.i32(i32 %{{.*}}) -; FORCE: br label %vector.body -; FORCE: vector.body: ; preds = %vector.body, %vector.ph -; FORCE: %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}}) -; FORCE: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}} +; CHECK: vector.body: +; CHECK: @llvm.arm.mve.vctp +; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask ; define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { entry: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -3,33 +3,33 @@ ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | \ ; RUN: FileCheck %s --check-prefix=ENABLED ; +; Forcing tail-predication should not be necessary here, thus we check the same +; ENABLED label as run above: ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=force-enabled %s -o - | \ -; RUN: FileCheck %s --check-prefix=FORCE +; RUN: FileCheck %s --check-prefix=ENABLED ; ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled-no-reductions %s -o - | \ ; RUN: FileCheck %s --check-prefix=NOREDUCTIONS ; ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=force-enabled-no-reductions %s -o - | \ -; RUN: FileCheck %s --check-prefix=FORCENOREDUCTIONS +; RUN: FileCheck %s --check-prefix=NOREDUCTIONS define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; ENABLED-NEXT: sub sp, #8 +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill ; ENABLED-NEXT: blt .LBB0_8 ; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: ldr r0, [sp, #44] -; ENABLED-NEXT: adr r7, .LCPI0_0 -; ENABLED-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; ENABLED-NEXT: add.w r9, r2, #3 -; ENABLED-NEXT: vldrw.u32 q0, [r7] -; ENABLED-NEXT: mov.w r11, #0 +; ENABLED-NEXT: ldr r0, [sp, #36] +; ENABLED-NEXT: add.w r12, r2, #3 +; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload +; 
ENABLED-NEXT: movs r6, #0 +; ENABLED-NEXT: mov r9, r12 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 -; ENABLED-NEXT: str.w r9, [sp] @ 4-byte Spill ; ENABLED-NEXT: b .LBB0_4 ; ENABLED-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: movs r0, #0 @@ -37,149 +37,61 @@ ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r11, lsl #1] -; ENABLED-NEXT: add.w r11, r11, #1 +; ENABLED-NEXT: strh.w r0, [r1, r6, lsl #1] +; ENABLED-NEXT: adds r6, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r11, r3 +; ENABLED-NEXT: cmp r6, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r11 +; ENABLED-NEXT: cmp r2, r6 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; ENABLED-NEXT: bic r7, r9, #3 -; ENABLED-NEXT: movs r6, #1 -; ENABLED-NEXT: subs r7, #4 -; ENABLED-NEXT: sub.w r0, r2, r11 -; ENABLED-NEXT: vmov.i32 q2, #0x0 -; ENABLED-NEXT: add.w r8, r6, r7, lsr #2 -; ENABLED-NEXT: ldr r7, [sp] @ 4-byte Reload -; ENABLED-NEXT: sub.w r4, r7, r11 -; ENABLED-NEXT: movs r7, #0 -; ENABLED-NEXT: bic r4, r4, #3 -; ENABLED-NEXT: subs r4, #4 -; ENABLED-NEXT: add.w r4, r6, r4, lsr #2 -; ENABLED-NEXT: subs r6, r0, #1 -; ENABLED-NEXT: dls lr, r4 -; ENABLED-NEXT: mov r4, r10 -; ENABLED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; ENABLED-NEXT: bic r0, r9, #3 +; ENABLED-NEXT: movs r7, #1 +; ENABLED-NEXT: subs r0, #4 +; ENABLED-NEXT: subs r4, r2, r6 +; ENABLED-NEXT: vmov.i32 q0, #0x0 +; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: dlstp.32 lr, r4 +; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vmov q1, q2 -; ENABLED-NEXT: vadd.i32 q2, q0, r7 -; ENABLED-NEXT: vdup.32 q3, r7 +; ENABLED-NEXT: vldrh.s32 q1, [r0], #8 +; ENABLED-NEXT: vldrh.s32 q2, [r7], #8 ; ENABLED-NEXT: mov lr, r8 -; ENABLED-NEXT: vcmp.u32 hi, q3, q2 -; ENABLED-NEXT: vdup.32 q3, r6 -; ENABLED-NEXT: vpnot +; ENABLED-NEXT: vmul.i32 q1, q2, q1 ; ENABLED-NEXT: sub.w r8, r8, #1 -; ENABLED-NEXT: vpsttt -; ENABLED-NEXT: vcmpt.u32 cs, q3, q2 -; ENABLED-NEXT: vldrht.s32 q2, [r0], #8 -; ENABLED-NEXT: vldrht.s32 q3, [r4], #8 -; ENABLED-NEXT: adds r7, #4 -; ENABLED-NEXT: vmul.i32 q2, q3, q2 -; ENABLED-NEXT: vshl.s32 q2, r5 -; ENABLED-NEXT: vadd.i32 q2, q2, q1 -; ENABLED-NEXT: le lr, .LBB0_6 +; ENABLED-NEXT: vshl.s32 q1, r5 +; ENABLED-NEXT: vadd.i32 q0, q1, q0 +; ENABLED-NEXT: letp lr, .LBB0_6 ; ENABLED-NEXT: @ %bb.7: @ %middle.block ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; ENABLED-NEXT: vpsel q1, q2, q1 -; ENABLED-NEXT: vaddv.u32 r0, q1 +; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #8 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} -; ENABLED-NEXT: .p2align 4 -; ENABLED-NEXT: @ %bb.9: -; ENABLED-NEXT: .LCPI0_0: -; ENABLED-NEXT: .long 0 @ 0x0 -; ENABLED-NEXT: .long 1 @ 0x1 -; ENABLED-NEXT: .long 2 @ 0x2 -; ENABLED-NEXT: .long 3 @ 0x3 -; -; FORCE-LABEL: varying_outer_2d_reduction: -; FORCE: @ %bb.0: @ %entry -; FORCE-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; FORCE-NEXT: sub sp, #4 -; FORCE-NEXT: cmp r3, #1 -; FORCE-NEXT: str r0, [sp] @ 4-byte Spill -; FORCE-NEXT: 
blt .LBB0_8 -; FORCE-NEXT: @ %bb.1: @ %for.body.lr.ph -; FORCE-NEXT: ldr r0, [sp, #36] -; FORCE-NEXT: add.w r12, r2, #3 -; FORCE-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; FORCE-NEXT: movs r6, #0 -; FORCE-NEXT: mov r9, r12 -; FORCE-NEXT: uxth r0, r0 -; FORCE-NEXT: rsbs r5, r0, #0 -; FORCE-NEXT: b .LBB0_4 -; FORCE-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 -; FORCE-NEXT: movs r0, #0 -; FORCE-NEXT: .LBB0_3: @ %for.end -; FORCE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCE-NEXT: lsrs r0, r0, #16 -; FORCE-NEXT: sub.w r9, r9, #1 -; FORCE-NEXT: strh.w r0, [r1, r6, lsl #1] -; FORCE-NEXT: adds r6, #1 -; FORCE-NEXT: add.w r10, r10, #2 -; FORCE-NEXT: cmp r6, r3 -; FORCE-NEXT: beq .LBB0_8 -; FORCE-NEXT: .LBB0_4: @ %for.body -; FORCE-NEXT: @ =>This Loop Header: Depth=1 -; FORCE-NEXT: @ Child Loop BB0_6 Depth 2 -; FORCE-NEXT: cmp r2, r6 -; FORCE-NEXT: ble .LBB0_2 -; FORCE-NEXT: @ %bb.5: @ %vector.ph -; FORCE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCE-NEXT: bic r0, r9, #3 -; FORCE-NEXT: movs r7, #1 -; FORCE-NEXT: subs r0, #4 -; FORCE-NEXT: subs r4, r2, r6 -; FORCE-NEXT: vmov.i32 q0, #0x0 -; FORCE-NEXT: add.w r8, r7, r0, lsr #2 -; FORCE-NEXT: mov r7, r10 -; FORCE-NEXT: dlstp.32 lr, r4 -; FORCE-NEXT: ldr r0, [sp] @ 4-byte Reload -; FORCE-NEXT: .LBB0_6: @ %vector.body -; FORCE-NEXT: @ Parent Loop BB0_4 Depth=1 -; FORCE-NEXT: @ => This Inner Loop Header: Depth=2 -; FORCE-NEXT: vldrh.s32 q1, [r0], #8 -; FORCE-NEXT: vldrh.s32 q2, [r7], #8 -; FORCE-NEXT: mov lr, r8 -; FORCE-NEXT: vmul.i32 q1, q2, q1 -; FORCE-NEXT: sub.w r8, r8, #1 -; FORCE-NEXT: vshl.s32 q1, r5 -; FORCE-NEXT: vadd.i32 q0, q1, q0 -; FORCE-NEXT: letp lr, .LBB0_6 -; FORCE-NEXT: @ %bb.7: @ %middle.block -; FORCE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCE-NEXT: vaddv.u32 r0, q0 -; FORCE-NEXT: b .LBB0_3 -; FORCE-NEXT: .LBB0_8: @ %for.end17 -; FORCE-NEXT: add sp, #4 -; FORCE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: add sp, #4 +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; NOREDUCTIONS-NEXT: sub sp, #8 +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp, #4] @ 4-byte Spill +; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill ; NOREDUCTIONS-NEXT: blt .LBB0_8 ; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: ldr r0, [sp, #44] -; NOREDUCTIONS-NEXT: adr r7, .LCPI0_0 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; NOREDUCTIONS-NEXT: add.w r9, r2, #3 -; NOREDUCTIONS-NEXT: vldrw.u32 q0, [r7] -; NOREDUCTIONS-NEXT: mov.w r11, #0 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] +; NOREDUCTIONS-NEXT: add.w r12, r2, #3 +; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: movs r6, #0 +; NOREDUCTIONS-NEXT: mov r9, r12 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 -; NOREDUCTIONS-NEXT: str.w r9, [sp] @ 4-byte Spill ; NOREDUCTIONS-NEXT: b .LBB0_4 ; NOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: movs r0, #0 @@ -187,131 +99,46 @@ ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r11, lsl #1] -; NOREDUCTIONS-NEXT: add.w r11, r11, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r6, lsl #1] +; NOREDUCTIONS-NEXT: adds r6, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, 
#2 -; NOREDUCTIONS-NEXT: cmp r11, r3 +; NOREDUCTIONS-NEXT: cmp r6, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r11 +; NOREDUCTIONS-NEXT: cmp r2, r6 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; NOREDUCTIONS-NEXT: bic r7, r9, #3 -; NOREDUCTIONS-NEXT: movs r6, #1 -; NOREDUCTIONS-NEXT: subs r7, #4 -; NOREDUCTIONS-NEXT: sub.w r0, r2, r11 -; NOREDUCTIONS-NEXT: vmov.i32 q2, #0x0 -; NOREDUCTIONS-NEXT: add.w r8, r6, r7, lsr #2 -; NOREDUCTIONS-NEXT: ldr r7, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: sub.w r4, r7, r11 -; NOREDUCTIONS-NEXT: movs r7, #0 -; NOREDUCTIONS-NEXT: bic r4, r4, #3 -; NOREDUCTIONS-NEXT: subs r4, #4 -; NOREDUCTIONS-NEXT: add.w r4, r6, r4, lsr #2 -; NOREDUCTIONS-NEXT: subs r6, r0, #1 -; NOREDUCTIONS-NEXT: dls lr, r4 -; NOREDUCTIONS-NEXT: mov r4, r10 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; NOREDUCTIONS-NEXT: bic r0, r9, #3 +; NOREDUCTIONS-NEXT: movs r7, #1 +; NOREDUCTIONS-NEXT: subs r0, #4 +; NOREDUCTIONS-NEXT: subs r4, r2, r6 +; NOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 +; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: dlstp.32 lr, r4 +; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vmov q1, q2 -; NOREDUCTIONS-NEXT: vadd.i32 q2, q0, r7 -; NOREDUCTIONS-NEXT: vdup.32 q3, r7 +; NOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 +; NOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 ; NOREDUCTIONS-NEXT: mov lr, r8 -; NOREDUCTIONS-NEXT: vcmp.u32 hi, q3, q2 -; NOREDUCTIONS-NEXT: vdup.32 q3, r6 -; NOREDUCTIONS-NEXT: vpnot +; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 ; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 -; NOREDUCTIONS-NEXT: vpsttt -; NOREDUCTIONS-NEXT: vcmpt.u32 cs, q3, q2 -; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r0], #8 -; NOREDUCTIONS-NEXT: vldrht.s32 q3, [r4], #8 -; NOREDUCTIONS-NEXT: adds r7, #4 -; NOREDUCTIONS-NEXT: vmul.i32 q2, q3, q2 -; NOREDUCTIONS-NEXT: vshl.s32 q2, r5 -; NOREDUCTIONS-NEXT: vadd.i32 q2, q2, q1 -; NOREDUCTIONS-NEXT: le lr, .LBB0_6 +; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 +; NOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 +; NOREDUCTIONS-NEXT: letp lr, .LBB0_6 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; NOREDUCTIONS-NEXT: vpsel q1, q2, q1 -; NOREDUCTIONS-NEXT: vaddv.u32 r0, q1 +; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #8 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} -; NOREDUCTIONS-NEXT: .p2align 4 -; NOREDUCTIONS-NEXT: @ %bb.9: -; NOREDUCTIONS-NEXT: .LCPI0_0: -; NOREDUCTIONS-NEXT: .long 0 @ 0x0 -; NOREDUCTIONS-NEXT: .long 1 @ 0x1 -; NOREDUCTIONS-NEXT: .long 2 @ 0x2 -; NOREDUCTIONS-NEXT: .long 3 @ 0x3 +; NOREDUCTIONS-NEXT: add sp, #4 +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} ; -; FORCENOREDUCTIONS-LABEL: varying_outer_2d_reduction: -; FORCENOREDUCTIONS: @ %bb.0: @ %entry -; FORCENOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; FORCENOREDUCTIONS-NEXT: sub sp, #4 -; FORCENOREDUCTIONS-NEXT: cmp r3, #1 -; FORCENOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill -; FORCENOREDUCTIONS-NEXT: blt .LBB0_8 -; 
FORCENOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; FORCENOREDUCTIONS-NEXT: ldr r0, [sp, #36] -; FORCENOREDUCTIONS-NEXT: add.w r12, r2, #3 -; FORCENOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; FORCENOREDUCTIONS-NEXT: movs r6, #0 -; FORCENOREDUCTIONS-NEXT: mov r9, r12 -; FORCENOREDUCTIONS-NEXT: uxth r0, r0 -; FORCENOREDUCTIONS-NEXT: rsbs r5, r0, #0 -; FORCENOREDUCTIONS-NEXT: b .LBB0_4 -; FORCENOREDUCTIONS-NEXT: .LBB0_2: @ in Loop: Header=BB0_4 Depth=1 -; FORCENOREDUCTIONS-NEXT: movs r0, #0 -; FORCENOREDUCTIONS-NEXT: .LBB0_3: @ %for.end -; FORCENOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCENOREDUCTIONS-NEXT: lsrs r0, r0, #16 -; FORCENOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; FORCENOREDUCTIONS-NEXT: strh.w r0, [r1, r6, lsl #1] -; FORCENOREDUCTIONS-NEXT: adds r6, #1 -; FORCENOREDUCTIONS-NEXT: add.w r10, r10, #2 -; FORCENOREDUCTIONS-NEXT: cmp r6, r3 -; FORCENOREDUCTIONS-NEXT: beq .LBB0_8 -; FORCENOREDUCTIONS-NEXT: .LBB0_4: @ %for.body -; FORCENOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 -; FORCENOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; FORCENOREDUCTIONS-NEXT: cmp r2, r6 -; FORCENOREDUCTIONS-NEXT: ble .LBB0_2 -; FORCENOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph -; FORCENOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCENOREDUCTIONS-NEXT: bic r0, r9, #3 -; FORCENOREDUCTIONS-NEXT: movs r7, #1 -; FORCENOREDUCTIONS-NEXT: subs r0, #4 -; FORCENOREDUCTIONS-NEXT: subs r4, r2, r6 -; FORCENOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 -; FORCENOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 -; FORCENOREDUCTIONS-NEXT: mov r7, r10 -; FORCENOREDUCTIONS-NEXT: dlstp.32 lr, r4 -; FORCENOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload -; FORCENOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body -; FORCENOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 -; FORCENOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; FORCENOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 -; FORCENOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 -; FORCENOREDUCTIONS-NEXT: mov lr, r8 -; FORCENOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 -; FORCENOREDUCTIONS-NEXT: sub.w r8, r8, #1 -; FORCENOREDUCTIONS-NEXT: vshl.s32 q1, r5 -; FORCENOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 -; FORCENOREDUCTIONS-NEXT: letp lr, .LBB0_6 -; FORCENOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block -; FORCENOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; FORCENOREDUCTIONS-NEXT: vaddv.u32 r0, q0 -; FORCENOREDUCTIONS-NEXT: b .LBB0_3 -; FORCENOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; FORCENOREDUCTIONS-NEXT: add sp, #4 -; FORCENOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0