Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -299,7 +299,7 @@
       // MVE vectors are 128-bit, but don't support 128 x i1.
       // TODO: Can we support vectors larger than 128-bits?
       unsigned MaxWidth = TTI->getRegisterBitWidth(true);
-      if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth)
+      if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
         return false;
       MaskedInsts.push_back(cast<IntrinsicInst>(&I));
     } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -9,41 +9,29 @@
 ; CHECK-NEXT: moveq r0, #0
 ; CHECK-NEXT: bxeq lr
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q1, r2
 ; CHECK-NEXT: sub.w r12, r3, #4
 ; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r3, .LCPI0_0
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q2, r2
-; CHECK-NEXT: adds r3, r1, r2
-; CHECK-NEXT: adds r2, #4
-; CHECK-NEXT: vpt.u32 cs, q1, q4
-; CHECK-NEXT: vldrbt.u32 q4, [r3]
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmla.u32 q0, q4, r0
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: mov r12, r2
+; CHECK-NEXT: adds r2, r1, r3
+; CHECK-NEXT: vldrb.u32 q2, [r2]
+; CHECK-NEXT: adds r3, #4
+; CHECK-NEXT: sub.w r2, r12, #4
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmla.u32 q0, q2, r0
+; CHECK-NEXT: letp lr, .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q0, q3
+; CHECK-NEXT: vctp.32 r12
+; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp7 = icmp eq i32 %N, 0
 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
@@ -94,41 +82,27 @@
 ; CHECK-NEXT: moveq r0, #0
 ; CHECK-NEXT: bxeq lr
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q1, r2
 ; CHECK-NEXT: sub.w r12, r3, #4
 ; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r3, .LCPI1_0
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB1_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q2, r2
-; CHECK-NEXT: adds r2, #4
-; CHECK-NEXT: vpt.u32 cs, q1, q4
-; CHECK-NEXT: vldrht.s32 q4, [r1]
+; CHECK-NEXT: vldrh.s32 q2, [r1]
+; CHECK-NEXT: mov r3, r2
 ; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmla.u32 q0, q4, r0
-; CHECK-NEXT: le lr, .LBB1_1
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmla.u32 q0, q2, r0
+; CHECK-NEXT: letp lr, .LBB1_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q0, q3
+; CHECK-NEXT: vctp.32 r3
+; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI1_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp7 = icmp eq i32 %N, 0
 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
@@ -179,41 +153,29 @@
 ; CHECK-NEXT: moveq r0, #0
 ; CHECK-NEXT: bxeq lr
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q1, r2
 ; CHECK-NEXT: sub.w r12, r3, #4
 ; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r3, .LCPI2_0
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB2_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q2, r2
-; CHECK-NEXT: adds r3, r1, r2
-; CHECK-NEXT: adds r2, #4
-; CHECK-NEXT: vpt.u32 cs, q1, q4
-; CHECK-NEXT: vldrbt.u32 q4, [r3]
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmla.u32 q0, q4, r0
-; CHECK-NEXT: le lr, .LBB2_1
+; CHECK-NEXT: mov r12, r2
+; CHECK-NEXT: adds r2, r1, r3
+; CHECK-NEXT: vldrb.u32 q2, [r2]
+; CHECK-NEXT: adds r3, #4
+; CHECK-NEXT: sub.w r2, r12, #4
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmla.u32 q0, q2, r0
+; CHECK-NEXT: letp lr, .LBB2_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q0, q3
+; CHECK-NEXT: vctp.32 r12
+; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI2_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp7 = icmp eq i32 %N, 0
 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
@@ -264,41 +226,27 @@
 ; CHECK-NEXT: moveq r0, #0
 ; CHECK-NEXT: bxeq lr
 ; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: vdup.32 q1, r2
 ; CHECK-NEXT: sub.w r12, r3, #4
 ; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r3, .LCPI3_0
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB3_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q2, r2
-; CHECK-NEXT: adds r2, #4
-; CHECK-NEXT: vpt.u32 cs, q1, q4
-; CHECK-NEXT: vldrht.u32 q4, [r1]
+; CHECK-NEXT: vldrh.u32 q2, [r1]
+; CHECK-NEXT: mov r3, r2
 ; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmla.u32 q0, q4, r0
-; CHECK-NEXT: le lr, .LBB3_1
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmla.u32 q0, q2, r0
+; CHECK-NEXT: letp lr, .LBB3_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q0, q3
+; CHECK-NEXT: vctp.32 r3
+; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI3_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp7 = icmp eq i32 %N, 0
 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
@@ -414,66 +362,59 @@
 ; CHECK-LABEL: test_vec_mul_scalar_add_char:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: ldr r7, [sp, #28]
-; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: cmp.w r12, #0
 ; CHECK-NEXT: beq.w .LBB5_12
 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: add.w r4, r3, r7, lsl #2
-; CHECK-NEXT: adds r5, r1, r7
-; CHECK-NEXT: cmp r4, r1
-; CHECK-NEXT: add.w r6, r0, r7
-; CHECK-NEXT: cset r12, hi
-; CHECK-NEXT: cmp r5, r3
-; CHECK-NEXT: cset r5, hi
-; CHECK-NEXT: cmp r4, r0
-; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: add.w r5, r3, r12, lsl #2
+; CHECK-NEXT: add.w r6, r1, r12
+; CHECK-NEXT: cmp r5, r1
+; CHECK-NEXT: add.w r4, r0, r12
+; CHECK-NEXT: cset r7, hi
 ; CHECK-NEXT: cmp r6, r3
 ; CHECK-NEXT: cset r6, hi
-; CHECK-NEXT: ands r6, r4
-; CHECK-NEXT: lsls r6, r6, #31
+; CHECK-NEXT: cmp r5, r0
+; CHECK-NEXT: cset r5, hi
+; CHECK-NEXT: cmp r4, r3
+; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: ands r5, r4
+; CHECK-NEXT: lsls r5, r5, #31
 ; CHECK-NEXT: itt eq
-; CHECK-NEXT: andeq.w r6, r5, r12
-; CHECK-NEXT: lslseq.w r6, r6, #31
+; CHECK-NEXT: andeq r7, r6
+; CHECK-NEXT: lslseq.w r7, r7, #31
 ; CHECK-NEXT: beq .LBB5_4
 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r6, r7, #1
-; CHECK-NEXT: and lr, r7, #3
-; CHECK-NEXT: cmp r6, #3
+; CHECK-NEXT: sub.w r4, r12, #1
+; CHECK-NEXT: and lr, r12, #3
+; CHECK-NEXT: cmp r4, #3
 ; CHECK-NEXT: bhs .LBB5_6
 ; CHECK-NEXT: @ %bb.3:
 ; CHECK-NEXT: movs r7, #0
 ; CHECK-NEXT: b .LBB5_9
 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
-; CHECK-NEXT: adds r6, r7, #3
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: bic r6, r6, #3
-; CHECK-NEXT: subs r7, #1
-; CHECK-NEXT: subs r6, #4
-; CHECK-NEXT: vdup.32 q0, r7
-; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: add.w lr, r5, r6, lsr #2
-; CHECK-NEXT: adr r6, .LCPI5_0
-; CHECK-NEXT: vldrw.u32 q1, [r6]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: add.w r7, r12, #3
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: bic r7, r7, #3
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: subs r7, #4
+; CHECK-NEXT: add.w lr, r6, r7, lsr #2
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB5_5: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r7
-; CHECK-NEXT: adds r4, r0, r7
-; CHECK-NEXT: vpt.u32 cs, q0, q2
-; CHECK-NEXT: vldrbt.u32 q2, [r4]
-; CHECK-NEXT: adds r4, r1, r7
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u32 q3, [r4]
-; CHECK-NEXT: vmul.i32 q2, q3, q2
-; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q2, [r3]
+; CHECK-NEXT: adds r5, r0, r4
+; CHECK-NEXT: vldrb.u32 q0, [r5]
+; CHECK-NEXT: adds r5, r1, r4
+; CHECK-NEXT: vldrb.u32 q1, [r5]
+; CHECK-NEXT: vmul.i32 q0, q1, q0
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3]
 ; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: adds r7, #4
-; CHECK-NEXT: le lr, .LBB5_5
+; CHECK-NEXT: adds r4, #4
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: letp lr, .LBB5_5
 ; CHECK-NEXT: b .LBB5_12
 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
-; CHECK-NEXT: sub.w r12, lr, r7
+; CHECK-NEXT: sub.w r12, lr, r12
 ; CHECK-NEXT: subs r4, r1, #3
 ; CHECK-NEXT: subs r5, r0, #3
 ; CHECK-NEXT: sub.w r7, r3, #16
@@ -517,13 +458,6 @@
 ; CHECK-NEXT: le lr, .LBB5_11
 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.13:
-; CHECK-NEXT: .LCPI5_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %res12 = bitcast i32* %res to i8*
 %cmp10 = icmp eq i32 %N, 0
@@ -678,36 +612,21 @@
 ; CHECK-NEXT: bic lr, lr, #3
 ; CHECK-NEXT: sub.w lr, lr, #4
 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: adr r4, .LCPI6_0
-; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: mov.w r12, #0
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB6_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r12
-; CHECK-NEXT: add.w r12, r12, #4
-; CHECK-NEXT: vptt.u32 cs, q0, q2
-; CHECK-NEXT: vldrht.s32 q2, [r0]
-; CHECK-NEXT: vldrht.s32 q3, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0]
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vmul.i32 q0, q1, q0
 ; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: vmul.i32 q2, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3]
 ; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q2, [r3]
 ; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: le lr, .LBB6_1
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: letp lr, .LBB6_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r4, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI6_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp10 = icmp eq i32 %N, 0
 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
@@ -754,66 +673,59 @@
 ; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: ldr r7, [sp, #28]
-; CHECK-NEXT: cmp r7, #0
+; CHECK-NEXT: ldr.w r12, [sp, #28]
+; CHECK-NEXT: cmp.w r12, #0
 ; CHECK-NEXT: beq.w .LBB7_12
 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: add.w r4, r3, r7, lsl #2
-; CHECK-NEXT: adds r5, r1, r7
-; CHECK-NEXT: cmp r4, r1
-; CHECK-NEXT: add.w r6, r0, r7
-; CHECK-NEXT: cset r12, hi
-; CHECK-NEXT: cmp r5, r3
-; CHECK-NEXT: cset r5, hi
-; CHECK-NEXT: cmp r4, r0
-; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: add.w r5, r3, r12, lsl #2
+; CHECK-NEXT: add.w r6, r1, r12
+; CHECK-NEXT: cmp r5, r1
+; CHECK-NEXT: add.w r4, r0, r12
+; CHECK-NEXT: cset r7, hi
 ; CHECK-NEXT: cmp r6, r3
 ; CHECK-NEXT: cset r6, hi
-; CHECK-NEXT: ands r6, r4
-; CHECK-NEXT: lsls r6, r6, #31
+; CHECK-NEXT: cmp r5, r0
+; CHECK-NEXT: cset r5, hi
+; CHECK-NEXT: cmp r4, r3
+; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: ands r5, r4
+; CHECK-NEXT: lsls r5, r5, #31
 ; CHECK-NEXT: itt eq
-; CHECK-NEXT: andeq.w r6, r5, r12
-; CHECK-NEXT: lslseq.w r6, r6, #31
+; CHECK-NEXT: andeq r7, r6
+; CHECK-NEXT: lslseq.w r7, r7, #31
 ; CHECK-NEXT: beq .LBB7_4
 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
-; CHECK-NEXT: subs r6, r7, #1
-; CHECK-NEXT: and lr, r7, #3
-; CHECK-NEXT: cmp r6, #3
+; CHECK-NEXT: sub.w r4, r12, #1
+; CHECK-NEXT: and lr, r12, #3
+; CHECK-NEXT: cmp r4, #3
 ; CHECK-NEXT: bhs .LBB7_6
 ; CHECK-NEXT: @ %bb.3:
 ; CHECK-NEXT: movs r7, #0
 ; CHECK-NEXT: b .LBB7_9
 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
-; CHECK-NEXT: adds r6, r7, #3
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: bic r6, r6, #3
-; CHECK-NEXT: subs r7, #1
-; CHECK-NEXT: subs r6, #4
-; CHECK-NEXT: vdup.32 q0, r7
-; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: add.w lr, r5, r6, lsr #2
-; CHECK-NEXT: adr r6, .LCPI7_0
-; CHECK-NEXT: vldrw.u32 q1, [r6]
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: add.w r7, r12, #3
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: bic r7, r7, #3
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: subs r7, #4
+; CHECK-NEXT: add.w lr, r6, r7, lsr #2
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB7_5: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r7
-; CHECK-NEXT: adds r4, r0, r7
-; CHECK-NEXT: vpt.u32 cs, q0, q2
-; CHECK-NEXT: vldrbt.u32 q2, [r4]
-; CHECK-NEXT: adds r4, r1, r7
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u32 q3, [r4]
-; CHECK-NEXT: vmul.i32 q2, q3, q2
-; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q2, [r3]
+; CHECK-NEXT: adds r5, r0, r4
+; CHECK-NEXT: vldrb.u32 q0, [r5]
+; CHECK-NEXT: adds r5, r1, r4
+; CHECK-NEXT: vldrb.u32 q1, [r5]
+; CHECK-NEXT: vmul.i32 q0, q1, q0
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3]
 ; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: adds r7, #4
-; CHECK-NEXT: le lr, .LBB7_5
+; CHECK-NEXT: adds r4, #4
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: letp lr, .LBB7_5
 ; CHECK-NEXT: b .LBB7_12
 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
-; CHECK-NEXT: sub.w r12, lr, r7
+; CHECK-NEXT: sub.w r12, lr, r12
 ; CHECK-NEXT: subs r4, r1, #3
 ; CHECK-NEXT: subs r5, r0, #3
 ; CHECK-NEXT: sub.w r7, r3, #16
@@ -857,13 +769,6 @@
 ; CHECK-NEXT: le lr, .LBB7_11
 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.13:
-; CHECK-NEXT: .LCPI7_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %res12 = bitcast i32* %res to i8*
 %cmp10 = icmp eq i32 %N, 0
@@ -1018,36 +923,21 @@
 ; CHECK-NEXT: bic lr, lr, #3
 ; CHECK-NEXT: sub.w lr, lr, #4
 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
-; CHECK-NEXT: sub.w r4, r12, #1
-; CHECK-NEXT: vdup.32 q0, r4
-; CHECK-NEXT: adr r4, .LCPI8_0
-; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: mov.w r12, #0
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q1, r12
-; CHECK-NEXT: add.w r12, r12, #4
-; CHECK-NEXT: vptt.u32 cs, q0, q2
-; CHECK-NEXT: vldrht.u32 q2, [r0]
-; CHECK-NEXT: vldrht.u32 q3, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0]
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vmul.i32 q0, q1, q0
 ; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: vmul.i32 q2, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3]
 ; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q2, [r3]
 ; CHECK-NEXT: adds r3, #16
-; CHECK-NEXT: le lr, .LBB8_1
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: letp lr, .LBB8_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r4, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI8_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry:
 %cmp10 = icmp eq i32 %N, 0
 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -1,9 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=armv8.1m.main -mattr=+mve -S -mve-tail-predication -disable-mve-tail-predication=false %s -o - | FileCheck %s

-; TODO: Support extending loads
-; CHECK-LABEL: mat_vec_sext_i16
-; CHECK-NOT: call {{.*}} @llvm.arm.vctp
 define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+; CHECK-LABEL: @mat_vec_sext_i16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP24:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader.us.preheader:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK: for.cond1.preheader.us:
+; CHECK-NEXT: [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]]
+; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
+; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
+; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP5]], i32 1)
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
+; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1
+; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
 entry:
 %cmp24 = icmp eq i32 %N, 0
 br i1 %cmp24, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
@@ -64,16 +113,56 @@
 ret void
 }

-; CHECK-LABEL: mat_vec_i32
-; CHECK: phi
-; CHECK: phi
-; CHECK: phi
-; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]])
-; CHECK: [[REM]] = sub i32 [[IV]], 4
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+; CHECK-LABEL: @mat_vec_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP23:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader.us.preheader:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
+; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK: for.cond1.preheader.us:
+; CHECK-NEXT: [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]]
+; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
+; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP5]], i32 1)
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]])
+; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4
+; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1
+; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
 entry:
 %cmp23 = icmp eq i32 %N, 0
 br i1 %cmp23, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
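Editor's note, for reference only (not part of the patch): the one functional change above relaxes the legality guard in MVETailPredication.cpp from "exactly one 128-bit vector" to "at most 128 bits", which is what lets the sub-128-bit extending loads in the updated tests (vldrb.u32, vldrh.s32, vldrh.u32) become tail-predicated. Below is a minimal standalone C++ sketch of the two predicates, assuming MaxWidth = 128 (the MVE Q-register width that TTI->getRegisterBitWidth(true) returns on this target) and a handful of illustrative (Lanes, ElementWidth) pairs that are not taken from the patch itself.

#include <cstdio>

int main() {
  const unsigned MaxWidth = 128; // Assumed: MVE Q registers are 128 bits.
  // Illustrative vector shapes, {Lanes, ElementWidth}; not from the patch.
  const unsigned Shapes[][2] = {
      {4, 8},  // <4 x i8>   (32 bits)  - rejected before, accepted now
      {4, 16}, // <4 x i16>  (64 bits)  - rejected before, accepted now
      {4, 32}, // <4 x i32>  (128 bits) - accepted by both
      {8, 16}, // <8 x i16>  (128 bits) - accepted by both
      {16, 8}, // <16 x i8>  (128 bits) - accepted by both
      {8, 32}, // <8 x i32>  (256 bits) - rejected by both
  };
  for (const auto &S : Shapes) {
    unsigned Lanes = S[0], ElementWidth = S[1];
    // Old guard: reject unless the type is exactly one 128-bit vector.
    bool OldReject = Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth;
    // New guard: reject only types wider than 128 bits, so narrower
    // vectors such as <4 x i8> and <4 x i16> are now tail-predicable.
    bool NewReject = Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth;
    std::printf("<%u x i%u>: old=%s new=%s\n", Lanes, ElementWidth,
                OldReject ? "reject" : "accept",
                NewReject ? "reject" : "accept");
  }
  return 0;
}

The Lanes == MaxWidth clause is kept verbatim from the patch; with these inputs it never fires, and the observable difference is exactly the narrow shapes that the new vldrb.u32/vldrh.u32 check lines above exercise.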