diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -299,7 +299,7 @@ // MVE vectors are 128-bit, but don't support 128 x i1. // TODO: Can we support vectors larger than 128-bits? unsigned MaxWidth = TTI->getRegisterBitWidth(true); - if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth) + if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -9,41 +9,29 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q2, r2 -; CHECK-NEXT: adds r3, r1, r2 -; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: vpt.u32 cs, q1, q4 -; CHECK-NEXT: vldrbt.u32 q4, [r3] -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmla.u32 q0, q4, r0 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: mov r12, r2 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmla.u32 q0, q2, r0 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -94,41 +82,27 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r3, .LCPI1_0 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q2, r2 -; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: vpt.u32 cs, q1, q4 -; CHECK-NEXT: vldrht.s32 q4, [r1] +; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmla.u32 q0, q4, r0 -; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmov q1, q0 +; 
CHECK-NEXT: vmla.u32 q0, q2, r0 +; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -179,41 +153,29 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q2, r2 -; CHECK-NEXT: adds r3, r1, r2 -; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: vpt.u32 cs, q1, q4 -; CHECK-NEXT: vldrbt.u32 q4, [r3] -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmla.u32 q0, q4, r0 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: mov r12, r2 +; CHECK-NEXT: adds r2, r1, r3 +; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: adds r3, #4 +; CHECK-NEXT: sub.w r2, r12, #4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmla.u32 q0, q2, r0 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -264,41 +226,27 @@ ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: subs r2, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r3, .LCPI3_0 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q2, r2 -; CHECK-NEXT: adds r2, #4 -; CHECK-NEXT: vpt.u32 cs, q1, q4 -; CHECK-NEXT: vldrht.u32 q4, [r1] +; CHECK-NEXT: vldrh.u32 q2, [r1] +; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmla.u32 q0, q4, r0 -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmla.u32 q0, q2, r0 +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; 
CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp7 = icmp eq i32 %N, 0 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph @@ -414,66 +362,59 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_char: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r7, [sp, #28] -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB5_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r7, lsl #2 -; CHECK-NEXT: adds r5, r1, r7 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r7 -; CHECK-NEXT: cset r12, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 -; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: add.w r5, r3, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12 +; CHECK-NEXT: cmp r5, r1 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: ands r5, r4 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r6, r5, r12 -; CHECK-NEXT: lslseq.w r6, r6, #31 +; CHECK-NEXT: andeq r7, r6 +; CHECK-NEXT: lslseq.w r7, r7, #31 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r6, r7, #1 -; CHECK-NEXT: and lr, r7, #3 -; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: sub.w r4, r12, #1 +; CHECK-NEXT: and lr, r12, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB5_9 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: adds r6, r7, #3 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: bic r6, r6, #3 -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: vdup.32 q0, r7 -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; CHECK-NEXT: adr r6, .LCPI5_0 -; CHECK-NEXT: vldrw.u32 q1, [r6] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r7, r12, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r7 -; CHECK-NEXT: adds r4, r0, r7 -; CHECK-NEXT: vpt.u32 cs, q0, q2 -; CHECK-NEXT: vldrbt.u32 q2, [r4] -; CHECK-NEXT: adds r4, r1, r7 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q3, [r4] -; CHECK-NEXT: vmul.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r3] +; CHECK-NEXT: adds r5, r0, r4 +; CHECK-NEXT: vldrb.u32 q0, [r5] +; CHECK-NEXT: adds r5, r1, r4 +; CHECK-NEXT: vldrb.u32 q1, [r5] +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: adds r7, #4 -; CHECK-NEXT: le lr, .LBB5_5 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_12 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r7 +; CHECK-NEXT: sub.w r12, lr, r12 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ -517,13 +458,6 @@ ; CHECK-NEXT: le lr, .LBB5_11 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup ; 
CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.13: -; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %res12 = bitcast i32* %res to i8* %cmp10 = icmp eq i32 %N, 0 @@ -678,36 +612,21 @@ ; CHECK-NEXT: bic lr, lr, #3 ; CHECK-NEXT: sub.w lr, lr, #4 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: adr r4, .LCPI6_0 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r12 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vptt.u32 cs, q0, q2 -; CHECK-NEXT: vldrht.s32 q2, [r0] -; CHECK-NEXT: vldrht.s32 q3, [r1] +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: le lr, .LBB6_1 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI6_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph @@ -754,66 +673,59 @@ ; CHECK-LABEL: test_vec_mul_scalar_add_uchar: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: ldr r7, [sp, #28] -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: ldr.w r12, [sp, #28] +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq.w .LBB7_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: add.w r4, r3, r7, lsl #2 -; CHECK-NEXT: adds r5, r1, r7 -; CHECK-NEXT: cmp r4, r1 -; CHECK-NEXT: add.w r6, r0, r7 -; CHECK-NEXT: cset r12, hi -; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: cset r5, hi -; CHECK-NEXT: cmp r4, r0 -; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: add.w r5, r3, r12, lsl #2 +; CHECK-NEXT: add.w r6, r1, r12 +; CHECK-NEXT: cmp r5, r1 +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: cset r7, hi ; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: cset r6, hi -; CHECK-NEXT: ands r6, r4 -; CHECK-NEXT: lsls r6, r6, #31 +; CHECK-NEXT: cmp r5, r0 +; CHECK-NEXT: cset r5, hi +; CHECK-NEXT: cmp r4, r3 +; CHECK-NEXT: cset r4, hi +; CHECK-NEXT: ands r5, r4 +; CHECK-NEXT: lsls r5, r5, #31 ; CHECK-NEXT: itt eq -; CHECK-NEXT: andeq.w r6, r5, r12 -; CHECK-NEXT: lslseq.w r6, r6, #31 +; CHECK-NEXT: andeq r7, r6 +; CHECK-NEXT: lslseq.w r7, r7, #31 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r6, r7, #1 -; CHECK-NEXT: and lr, r7, #3 -; CHECK-NEXT: cmp r6, #3 +; CHECK-NEXT: sub.w r4, r12, #1 +; CHECK-NEXT: and lr, r12, #3 +; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB7_9 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: adds r6, r7, #3 -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: bic r6, r6, #3 -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: vdup.32 q0, r7 -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: add.w lr, r5, r6, lsr #2 -; 
CHECK-NEXT: adr r6, .LCPI7_0 -; CHECK-NEXT: vldrw.u32 q1, [r6] -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: add.w r7, r12, #3 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: subs r7, #4 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r7 -; CHECK-NEXT: adds r4, r0, r7 -; CHECK-NEXT: vpt.u32 cs, q0, q2 -; CHECK-NEXT: vldrbt.u32 q2, [r4] -; CHECK-NEXT: adds r4, r1, r7 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q3, [r4] -; CHECK-NEXT: vmul.i32 q2, q3, q2 -; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r3] +; CHECK-NEXT: adds r5, r0, r4 +; CHECK-NEXT: vldrb.u32 q0, [r5] +; CHECK-NEXT: adds r5, r1, r4 +; CHECK-NEXT: vldrb.u32 q1, [r5] +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: adds r7, #4 -; CHECK-NEXT: le lr, .LBB7_5 +; CHECK-NEXT: adds r4, #4 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_12 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new -; CHECK-NEXT: sub.w r12, lr, r7 +; CHECK-NEXT: sub.w r12, lr, r12 ; CHECK-NEXT: subs r4, r1, #3 ; CHECK-NEXT: subs r5, r0, #3 ; CHECK-NEXT: sub.w r7, r3, #16 @@ -857,13 +769,6 @@ ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.13: -; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %res12 = bitcast i32* %res to i8* %cmp10 = icmp eq i32 %N, 0 @@ -1018,36 +923,21 @@ ; CHECK-NEXT: bic lr, lr, #3 ; CHECK-NEXT: sub.w lr, lr, #4 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: sub.w r4, r12, #1 -; CHECK-NEXT: vdup.32 q0, r4 -; CHECK-NEXT: adr r4, .LCPI8_0 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q1, r12 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: vptt.u32 cs, q0, q2 -; CHECK-NEXT: vldrht.u32 q2, [r0] -; CHECK-NEXT: vldrht.u32 q3, [r1] +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q2, [r3] ; CHECK-NEXT: adds r3, #16 -; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph @@ -1320,18 +1210,79 @@ br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } -; Function Attrs: argmemonly nounwind readonly willreturn -declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #2 +define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture %a, i8* 
nocapture readonly %b, i8* nocapture readonly %c, i32 %N) { +; CHECK-LABEL: test_v8i8_to_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w r12, r3, #7 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #7 +; CHECK-NEXT: sub.w r12, r12, #8 +; CHECK-NEXT: add.w lr, lr, r12, lsr #3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: vldrb.u16 q0, [r4] +; CHECK-NEXT: add.w r4, r2, r12 +; CHECK-NEXT: vldrb.u16 q1, [r4] +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: add.w r12, r12, #8 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: letp lr, .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 7 + %n.vec = and i32 %n.rnd.up, -8 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer + %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %0 = getelementptr inbounds i8, i8* %b, i32 %index + %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %2 = bitcast i8* %0 to <8 x i8>* + %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef) + %3 = zext <8 x i8> %wide.masked.load to <8 x i16> + %4 = getelementptr inbounds i8, i8* %c, i32 %index + %5 = bitcast i8* %4 to <8 x i8>* + %wide.masked.load14 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %5, i32 1, <8 x i1> %1, <8 x i8> undef) + %6 = zext <8 x i8> %wide.masked.load14 to <8 x i16> + %7 = mul nuw <8 x i16> %6, %3 + %8 = getelementptr inbounds i16, i16* %a, i32 %index + %9 = bitcast i16* %8 to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %7, <8 x i16>* %9, i32 2, <8 x i1> %1) + %index.next = add i32 %index, 8 + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %for.cond.cleanup, label %vector.body -; Function Attrs: nounwind readnone willreturn -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #3 +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} -; Function Attrs: argmemonly nounwind readonly willreturn -declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #2 +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, 
i32 immarg, <4 x i1>) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -; Function Attrs: argmemonly nounwind readonly willreturn -declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 -; Function Attrs: argmemonly nounwind willreturn -declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -1,9 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=armv8.1m.main -mattr=+mve -S -mve-tail-predication -disable-mve-tail-predication=false %s -o - | FileCheck %s -; TODO: Support extending loads -; CHECK-LABEL: mat_vec_sext_i16 -; CHECK-NOT: call {{.*}} @llvm.arm.vctp define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { +; CHECK-LABEL: @mat_vec_sext_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.us.preheader: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; CHECK: for.cond1.preheader.us: +; CHECK-NEXT: [[I_025_US:%.*]] = phi i32 [ [[INC10_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16*, i16** [[A:%.*]], i32 [[I_025_US]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16*, i16** [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_025_US]] +; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0 +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <4 x i16>* +; 
CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP11]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14]] = add nsw <4 x i32> [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP5]], i32 1) +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP14]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) +; CHECK-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX8_US]], align 4 +; CHECK-NEXT: [[INC10_US]] = add nuw i32 [[I_025_US]], 1 +; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i32 [[INC10_US]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: %cmp24 = icmp eq i32 %N, 0 br i1 %cmp24, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader @@ -64,16 +113,56 @@ ret void } -; CHECK-LABEL: mat_vec_i32 -; CHECK: phi -; CHECK: phi -; CHECK: phi -; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]]) -; CHECK: [[REM]] = sub i32 [[IV]], 4 -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) { +; CHECK-LABEL: @mat_vec_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP23:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.us.preheader: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[TMP:%.*]] = add i32 [[N_VEC]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] +; CHECK: for.cond1.preheader.us: +; CHECK-NEXT: [[I_024_US:%.*]] = phi i32 [ [[INC9_US:%.*]], [[MIDDLE_BLOCK:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i32 [[I_024_US]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[I_024_US]] +; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0 +; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ 
[[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[VEC_PHI]], [[TMP11]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP5]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP12]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-NEXT: store i32 [[TMP16]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[INC9_US]] = add nuw i32 [[I_024_US]], 1 +; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[INC9_US]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[FOR_COND1_PREHEADER_US]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: %cmp23 = icmp eq i32 %N, 0 br i1 %cmp23, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -0,0 +1,531 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc void @test_wlstp8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) { + entry: + %0 = add i32 %N, 15 + %1 = lshr i32 %0, 4 + %2 = shl nuw i32 %1, 4 + %3 = add i32 %2, -16 + %4 = lshr i32 %3, 4 + %n.vec = add nuw nsw i32 %4, 1 + %cmp = call i1 @llvm.test.set.loop.iterations.i32(i32 %n.vec) + br i1 %cmp, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %count = phi i32 [ %n.vec, %vector.ph ], [ %loop.dec, %vector.body ] + %5 = phi i32 [ %N, %vector.ph ], [ %7, %vector.body ] + %6 = call <16 x i1> @llvm.arm.vctp8(i32 %5) + %7 = sub i32 %5, 16 + %scevgep4 = getelementptr i8, i8* %b, i32 %index + %scevgep45 = bitcast i8* %scevgep4 to <16 x i8>* + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %scevgep45, i32 1, 
<16 x i1> %6, <16 x i8> undef) + %scevgep2 = getelementptr i8, i8* %c, i32 %index + %scevgep23 = bitcast i8* %scevgep2 to <16 x i8>* + %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %scevgep23, i32 1, <16 x i1> %6, <16 x i8> undef) + %tmp5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load + %scevgep = getelementptr i8, i8* %a, i32 %index + %scevgep1 = bitcast i8* %scevgep to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %tmp5, <16 x i8>* %scevgep1, i32 1, <16 x i1> %6) + %index.next = add i32 %index, 16 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) + %tmp8 = icmp eq i32 %loop.dec, 0 + br i1 %tmp8, label %for.cond.cleanup, label %vector.body + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + + define dso_local arm_aapcs_vfpcc void @test_wlstp16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) { + entry: + %0 = add i32 %N, 7 + %1 = lshr i32 %0, 3 + %2 = shl nuw i32 %1, 3 + %3 = add i32 %2, -8 + %4 = lshr i32 %3, 3 + %n.vec = add nuw nsw i32 %4, 1 + %cmp = call i1 @llvm.test.set.loop.iterations.i32(i32 %n.vec) + br i1 %cmp, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv5 = phi i16* [ %scevgep6, %vector.body ], [ %b, %vector.ph ] + %lsr.iv2 = phi i16* [ %scevgep3, %vector.body ], [ %c, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %count = phi i32 [ %n.vec, %vector.ph ], [ %loop.dec, %vector.body ] + %5 = phi i32 [ %N, %vector.ph ], [ %7, %vector.body ] + %lsr.iv57 = bitcast i16* %lsr.iv5 to <8 x i16>* + %lsr.iv24 = bitcast i16* %lsr.iv2 to <8 x i16>* + %lsr.iv1 = bitcast i16* %lsr.iv to <8 x i16>* + %6 = call <8 x i1> @llvm.arm.vctp16(i32 %5) + %7 = sub i32 %5, 8 + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv57, i32 2, <8 x i1> %6, <8 x i16> undef) + %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %lsr.iv24, i32 2, <8 x i1> %6, <8 x i16> undef) + %tmp5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %tmp5, <8 x i16>* %lsr.iv1, i32 2, <8 x i1> %6) + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) + %tmp8 = icmp eq i32 %loop.dec, 0 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 8 + %scevgep3 = getelementptr i16, i16* %lsr.iv2, i32 8 + %scevgep6 = getelementptr i16, i16* %lsr.iv5, i32 8 + br i1 %tmp8, label %for.cond.cleanup, label %vector.body + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + + define dso_local i32 @test_wlstp32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { + entry: + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %n.vec = add nuw nsw i32 %4, 1 + %cmp = call i1 @llvm.test.set.loop.iterations.i32(i32 %n.vec) + br i1 %cmp, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ] + %count = phi i32 [ %n.vec, %vector.ph ], [ %loop.dec, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp6, %vector.body ] + %5 = phi i32 [ %N, 
%vector.ph ], [ %7, %vector.body ] + %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* + %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* + %6 = call <4 x i1> @llvm.arm.vctp32(i32 %5) + %7 = sub i32 %5, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %6, <4 x i32> undef) + %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %6, <4 x i32> undef) + %tmp5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load + %tmp6 = add nsw <4 x i32> %tmp5, %vec.phi + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1) + %tmp7 = icmp eq i32 %loop.dec, 0 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4 + br i1 %tmp7, label %middle.block, label %vector.body + + middle.block: ; preds = %vector.body + %8 = call <4 x i1> @llvm.arm.vctp32(i32 %5) + %tmp8 = select <4 x i1> %8, <4 x i32> %tmp6, <4 x i32> %vec.phi + %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ] + ret i32 %res.0.lcssa + } + + declare i1 @llvm.test.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) + declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare <16 x i1> @llvm.arm.vctp8(i32) + declare void @llvm.stackprotector(i8*, i8**) + declare <8 x i1> @llvm.arm.vctp16(i32) + declare <4 x i1> @llvm.arm.vctp32(i32) +... 
+--- +name: test_wlstp8 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_wlstp8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg + ; CHECK: $lr = MVE_WLSTP_8 renamable $lr, %bb.1 + ; CHECK: tB %bb.3, 14, $noreg + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg + ; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1) + ; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg + ; CHECK: renamable $q1 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep23, align 1) + ; CHECK: renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg + ; CHECK: renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 + ; CHECK: bb.3.for.cond.cleanup: + ; CHECK: 
tPOP_RET 14, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.3(0x40000000), %bb.1(0x40000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg + t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.3, 14, $noreg + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + + bb.2.vector.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + liveins: $lr, $r0, $r1, $r2, $r3, $r12 + + renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg + renamable $vpr = MVE_VCTP8 renamable $r3, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 1, renamable $vpr :: (load 16 from %ir.scevgep45, align 1) + renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg + MVE_VPST 8, implicit $vpr + renamable $q1 = MVE_VLDRBU8 killed renamable $r4, 0, 1, renamable $vpr :: (load 16 from %ir.scevgep23, align 1) + renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg + renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg + renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 1, killed renamable $vpr :: (store 16 into %ir.scevgep1, align 1) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg + + bb.3.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r4, def $pc + +... 
+--- +name: test_wlstp16 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_wlstp16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r12 = t2ADDri renamable $r3, 7, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg + ; CHECK: $lr = MVE_WLSTP_16 renamable $lr, %bb.1 + ; CHECK: tB %bb.2, 14, $noreg + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) + ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) + ; CHECK: renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 2) + ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg + ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg + ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $r0, $r1, $r2, $r3, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + 
frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r12 = t2ADDri renamable $r3, 7, 14, $noreg, $noreg + renamable $lr = t2MOVi 1, 14, $noreg, $noreg + renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg + t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.1.vector.body: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + liveins: $lr, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv57, align 2) + renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 2) + renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + MVE_VPST 8, implicit $vpr + MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 2) + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg + renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg + renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.for.cond.cleanup: + tPOP_RET 14, $noreg, def $r7, def $pc + +... +--- +name: test_wlstp32 +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_wlstp32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.4(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $r3 = t2BICri killed 
renamable $r3, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + ; CHECK: $lr = MVE_WLSTP_32 renamable $lr, %bb.1 + ; CHECK: tB %bb.4, 14, $noreg + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + ; CHECK: bb.2.vector.body: + ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, killed $noreg :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: $r3 = tMOVr $r2, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg + ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg + ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: liveins: $q0, $q1, $r3 + ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r3, 0, $noreg + ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + ; CHECK: renamable $r12 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: bb.4.for.cond.cleanup: + ; CHECK: liveins: $r12 + ; CHECK: $r0 = tMOVr killed $r12, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.4(0x40000000), %bb.1(0x40000000) + liveins: $r0, $r1, $r2, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg + renamable $r12 = t2MOVi 0, 14, $noreg, $noreg + t2WhileLoopStart renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.4, 14, $noreg + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $lr, $r0, $r1, $r2 + + renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 + + bb.2.vector.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + liveins: $lr, $q1, $r0, $r1, $r2 + + $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + renamable $vpr = MVE_VCTP32 $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) + renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) + 
$r3 = tMOVr $r2, 14, $noreg + renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg + renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg + renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg + renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14, $noreg + + bb.3.middle.block: + successors: %bb.4(0x80000000) + liveins: $q0, $q1, $r3 + + renamable $vpr = MVE_VCTP32 killed renamable $r3, 0, $noreg + renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr + renamable $r12 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + + bb.4.for.cond.cleanup: + liveins: $r12 + + $r0 = tMOVr killed $r12, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +...