diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -541,7 +541,6 @@
       return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
     }));
   }
-  addPass(createMVEVPTBlockPass());
   addPass(createThumb2ITBlockPass());
 
   // Add both scheduling passes to give the subtarget an opportunity to pick
@@ -551,6 +550,7 @@
     addPass(&PostRASchedulerID);
   }
 
+  addPass(createMVEVPTBlockPass());
   addPass(createARMIndirectThunks());
   addPass(createARMSLSHardeningPass());
 }
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -151,12 +151,12 @@
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Machine Block Frequency Analysis
 ; CHECK-NEXT: If Converter
-; CHECK-NEXT: MVE VPT block insertion pass
 ; CHECK-NEXT: Thumb IT blocks insertion pass
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: PostRA Machine Instruction Scheduler
 ; CHECK-NEXT: Post RA top-down list latency scheduler
+; CHECK-NEXT: MVE VPT block insertion pass
 ; CHECK-NEXT: ARM Indirect Thunks
 ; CHECK-NEXT: ARM sls hardening pass
 ; CHECK-NEXT: Analyze Machine Code For Garbage Collection
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/arm_cmplx_dot_prod_f32.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/arm_cmplx_dot_prod_f32.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/arm_cmplx_dot_prod_f32.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/arm_cmplx_dot_prod_f32.ll
@@ -44,8 +44,8 @@
 ; CHECK-NEXT: vcmla.f32 q0, q2, q1, #90
 ; CHECK-NEXT: cbz r2, .LBB0_8
 ; CHECK-NEXT: @ %bb.4: @ %while.body9
-; CHECK-NEXT: cmp r2, #4
 ; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: cmp r2, #4
 ; CHECK-NEXT: vpstttt
 ; CHECK-NEXT: vldrwt.u32 q1, [r1]
 ; CHECK-NEXT: vldrwt.u32 q2, [r0]
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -20,20 +20,20 @@
 ; CHECK-NEXT: .LBB0_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: and r4, r12, #15
-; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: vctp.32 r3
+; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q1, [r2], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
 ; CHECK-NEXT: vdup.32 q3, r4
+; CHECK-NEXT: add.w r12, r12, #4
 ; CHECK-NEXT: vpt.i32 eq, q3, zr
 ; CHECK-NEXT: vmovt q1, q2
 ; CHECK-NEXT: vctp.32 r3
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vmul.i32 q1, q1, q2
-; CHECK-NEXT: add.w r12, r12, #4
 ; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: vmul.i32 q1, q1, q2
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB0_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -101,22 +101,22 @@
 ; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: .LBB1_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vctp.32 r12
 ; CHECK-NEXT: and r5, r4, #15
 ; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vctp.32 r12
 ; CHECK-NEXT: vpsttt
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r3], #16
 ; CHECK-NEXT: vldrwt.u32 q3, [r2], #16
 ; CHECK-NEXT: vdup.32 q4, r5
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vpt.i32 eq, q4, zr
 ; CHECK-NEXT: vsubt.i32 q1, q3, q2
 ; CHECK-NEXT: vctp.32 r12
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vmul.i32 q1, q1, q2
-; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vmul.i32 q1, q1, q2
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB1_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -200,8 +200,8 @@
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vsub.i32 q1, q2, q1
 ; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vsub.i32 q1, q2, q1
 ; CHECK-NEXT: vpsttt
 ; CHECK-NEXT: vcmpt.i32 eq, q1, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
@@ -288,13 +288,13 @@
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
-; CHECK-NEXT: vsub.i32 q1, q2, q1
 ; CHECK-NEXT: vpnot
+; CHECK-NEXT: vsub.i32 q1, q2, q1
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vpstee
 ; CHECK-NEXT: vcmpt.i32 ne, q1, zr
 ; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
 ; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
-; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vmul.i32 q1, q2, q1
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB3_2
@@ -415,8 +415,9 @@
 ; CHECK-NEXT: .LBB5_2: @ %bb12
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vptt.i32 ne, q0, zr
+; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vcmpt.s32 le, q0, r2
+; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT: vmul.i32 q0, q1, q0
 ; CHECK-NEXT: vpst
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -214,10 +214,10 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
 ; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vfma.f32 q0, q3, q2
 ; CHECK-NEXT: le lr, .LBB1_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -21,9 +21,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB0_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -86,9 +86,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrht.s32 q2, [r1], #8
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB1_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -151,9 +151,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrbt.u32 q2, [r1], #4
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB2_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -216,9 +216,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrht.u32 q2, [r1], #8
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB3_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -281,9 +281,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB4_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -78,12 +78,11 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.16 r2
 ; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrbt.u16 q1, [r0], #8
+; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT: subs r2, #8
 ; CHECK-NEXT: vadd.i16 q1, q0, q1
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u16 q2, [r1], #8
 ; CHECK-NEXT: vadd.i16 q1, q1, q2
 ; CHECK-NEXT: le lr, .LBB1_2
 ; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -559,9 +558,9 @@
 ; CHECK-NEXT: vldrbt.u16 q1, [r3], #8
 ; CHECK-NEXT: vldrbt.u16 q4, [r4], #8
 ; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: subs r2, #8
 ; CHECK-NEXT: vsub.i16 q3, q4, q1
 ; CHECK-NEXT: vmul.i16 q1, q4, q1
-; CHECK-NEXT: subs r2, #8
 ; CHECK-NEXT: vadd.i16 q3, q3, q2
 ; CHECK-NEXT: vadd.i16 q1, q1, q0
 ; CHECK-NEXT: le lr, .LBB7_2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -57,10 +57,10 @@
 ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vctp.16 r6
+; CHECK-NEXT: subs r6, #8
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrht.u16 q0, [r5]
 ; CHECK-NEXT: vshr.u16 q1, q0, #3
-; CHECK-NEXT: subs r6, #8
 ; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmov q2, q4
 ; CHECK-NEXT: vmla.u16 q2, q1, r2
@@ -237,10 +237,10 @@
 ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: vldrh.u16 q0, [r5]
-; CHECK-NEXT: vshl.i16 q1, q0, #3
-; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmov.f64 d6, d4
 ; CHECK-NEXT: vmov.f64 d7, d5
+; CHECK-NEXT: vshl.i16 q1, q0, #3
+; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmov q2, q4
 ; CHECK-NEXT: vmla.u16 q2, q1, r3
 ; CHECK-NEXT: vshr.u16 q1, q0, #3
@@ -265,10 +265,10 @@
 ; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
 ; CHECK-NEXT: vmov.f64 d8, d10
 ; CHECK-NEXT: vmov.f64 d9, d11
-; CHECK-NEXT: vmov.f64 d10, d14
-; CHECK-NEXT: vmov.f64 d11, d15
 ; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: vmov.f64 d10, d14
+; CHECK-NEXT: vmov.f64 d11, d15
 ; CHECK-NEXT: vstrh.16 q0, [r5], #16
 ; CHECK-NEXT: letp lr, .LBB1_4
 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll
@@ -57,13 +57,13 @@
 ; DISABLED-NEXT: .LBB0_3: @ %vector.body
 ; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1
 ; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2
-; DISABLED-NEXT: mov lr, r7
 ; DISABLED-NEXT: vctp.32 r6
-; DISABLED-NEXT: subs r7, #1
-; DISABLED-NEXT: subs r6, #4
+; DISABLED-NEXT: mov lr, r7
 ; DISABLED-NEXT: vpstt
 ; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16
 ; DISABLED-NEXT: vldrwt.u32 q1, [r4], #16
+; DISABLED-NEXT: subs r7, #1
+; DISABLED-NEXT: subs r6, #4
 ; DISABLED-NEXT: vadd.i32 q0, q1, q0
 ; DISABLED-NEXT: vpst
 ; DISABLED-NEXT: vstrwt.32 q0, [r12], #16
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@@ -63,9 +63,9 @@
 ; CHECK-NEXT: .LBB1_1: @ %do.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: vmlava.s32 r12, q0, q1
 ; CHECK-NEXT: le lr, .LBB1_1
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -71,10 +71,10 @@
 ; ENABLED-NEXT: vldrht.s32 q1, [r0], #8
 ; ENABLED-NEXT: vldrht.s32 q2, [r7], #8
 ; ENABLED-NEXT: mov lr, r6
-; ENABLED-NEXT: vmul.i32 q1, q2, q1
 ; ENABLED-NEXT: subs r6, #1
-; ENABLED-NEXT: vshl.s32 q1, r5
+; ENABLED-NEXT: vmul.i32 q1, q2, q1
 ; ENABLED-NEXT: subs r4, #4
+; ENABLED-NEXT: vshl.s32 q1, r5
 ; ENABLED-NEXT: vadd.i32 q1, q1, q0
 ; ENABLED-NEXT: le lr, .LBB0_6
 ; ENABLED-NEXT: @ %bb.7: @ %middle.block
@@ -142,10 +142,10 @@
 ; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8
 ; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8
 ; NOREDUCTIONS-NEXT: mov lr, r6
-; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
 ; NOREDUCTIONS-NEXT: subs r6, #1
-; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
+; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1
 ; NOREDUCTIONS-NEXT: subs r4, #4
+; NOREDUCTIONS-NEXT: vshl.s32 q1, r5
 ; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0
 ; NOREDUCTIONS-NEXT: le lr, .LBB0_6
 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -342,11 +342,11 @@
 ; CHECK-NEXT: add r0, sp, #88
 ; CHECK-NEXT: vcmp.i8 ne, q3, zr
 ; CHECK-NEXT: vldr d1, [sp, #80]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
 ; CHECK-NEXT: vpnot
-; CHECK-NEXT: vmov d0, r2, r3
+; CHECK-NEXT: vldrw.u32 q1, [r0]
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vcmpt.i8 ne, q2, zr
+; CHECK-NEXT: vmov d0, r2, r3
 ; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vmov r0, r1, d0
 ; CHECK-NEXT: vmov r2, r3, d1
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -453,8 +453,8 @@
 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT: vshl.i32 q2, q1, #2
-; CHECK-NEXT: vadd.i32 q1, q1, q6
 ; CHECK-NEXT: vadd.i32 q2, q2, r10
+; CHECK-NEXT: vadd.i32 q1, q1, q6
 ; CHECK-NEXT: vstrw.32 q0, [q2]
 ; CHECK-NEXT: letp lr, .LBB1_10
 ; CHECK-NEXT: b .LBB1_13
@@ -467,8 +467,8 @@
 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT: vshl.i32 q2, q1, #2
-; CHECK-NEXT: vadd.i32 q1, q1, q5
 ; CHECK-NEXT: vadd.i32 q2, q2, r10
+; CHECK-NEXT: vadd.i32 q1, q1, q5
 ; CHECK-NEXT: vstrw.32 q0, [q2]
 ; CHECK-NEXT: letp lr, .LBB1_12
 ; CHECK-NEXT: .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -835,10 +835,10 @@
 ; CHECK-NEXT: and r5, r3, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vctp.16 r5
+; CHECK-NEXT: add.w r1, r10, #2
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrht.16 q0, [r4]
 ; CHECK-NEXT: vldrw.u32 q0, [r10]
-; CHECK-NEXT: add.w r1, r10, #2
 ; CHECK-NEXT: vldrw.u32 q1, [r1]
 ; CHECK-NEXT: add.w r1, r10, #6
 ; CHECK-NEXT: vmul.f16 q0, q0, r7
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1481,9 +1481,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q4, [q3, #128]!
 ; CHECK-NEXT: vldrw.u32 q5, [q2, #128]!
+; CHECK-NEXT: vldrw.u32 q6, [q0, #128]!
 ; CHECK-NEXT: vadd.i32 q4, q5, q4
 ; CHECK-NEXT: vldrw.u32 q5, [q1, #128]!
-; CHECK-NEXT: vldrw.u32 q6, [q0, #128]!
 ; CHECK-NEXT: vadd.i32 q4, q4, q5
 ; CHECK-NEXT: vadd.i32 q4, q4, q6
 ; CHECK-NEXT: vstrw.32 q4, [r0], #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -231,8 +231,8 @@
 define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i16 q0, #0x1
 ; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vmov.i16 q0, #0x1
 ; CHECK-NEXT: vpt.s16 gt, q1, zr
 ; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1]
 ; CHECK-NEXT: vpsel q0, q2, q0
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -152,13 +152,13 @@
 ; CHECK-NEXT: .LBB2_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vctp.32 r3
+; CHECK-NEXT: subs r3, #4
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q3, [r4], #16
 ; CHECK-NEXT: vmul.i32 q3, q3, q2
-; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vadd.i32 q0, q0, q3
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q3, [q1, #80]!
+; CHECK-NEXT: vadd.i32 q0, q0, q3
 ; CHECK-NEXT: le lr, .LBB2_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
 ; CHECK-NEXT: vaddv.u32 r0, q0
@@ -244,13 +244,13 @@
 ; CHECK-NEXT: .LBB3_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrb.u32 q7, [r0, q1]
 ; CHECK-NEXT: vldrb.u32 q5, [r0, q2]
+; CHECK-NEXT: vldrb.u32 q6, [r0, q3]
+; CHECK-NEXT: vldrb.u32 q7, [r0, q1]
+; CHECK-NEXT: adds r0, #12
 ; CHECK-NEXT: vmul.i32 q4, q5, r8
 ; CHECK-NEXT: vmla.u32 q4, q7, r9
-; CHECK-NEXT: vldrb.u32 q6, [r0, q3]
 ; CHECK-NEXT: vmla.u32 q4, q6, r12
-; CHECK-NEXT: adds r0, #12
 ; CHECK-NEXT: vadd.i32 q4, q4, q0
 ; CHECK-NEXT: vshr.u32 q4, q4, #16
 ; CHECK-NEXT: vstrb.32 q4, [r1, q1]
@@ -263,8 +263,8 @@
 ; CHECK-NEXT: vmla.u32 q4, q6, r4
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
 ; CHECK-NEXT: vadd.i32 q4, q4, q0
-; CHECK-NEXT: vshr.u32 q1, q1, #16
 ; CHECK-NEXT: vshr.u32 q4, q4, #16
+; CHECK-NEXT: vshr.u32 q1, q1, #16
 ; CHECK-NEXT: vstrb.32 q4, [r1, q2]
 ; CHECK-NEXT: vstrb.32 q1, [r1, q3]
 ; CHECK-NEXT: adds r1, #12
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
@@ -1543,8 +1543,8 @@
 define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -1562,9 +1562,9 @@
 define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1582,9 +1582,9 @@
 define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_2:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #2
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1602,8 +1602,8 @@
 define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
 ; CHECK-NEXT: bx lr
@@ -1621,9 +1621,9 @@
 define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #512
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1641,8 +1641,8 @@
 define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
 ; CHECK-NEXT: bx lr
@@ -1660,9 +1660,9 @@
 define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #512
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1680,8 +1680,8 @@
 define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -1699,9 +1699,9 @@
 define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1719,8 +1719,8 @@
 define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -1738,8 +1738,8 @@
 define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #254]
 ; CHECK-NEXT: bx lr
@@ -1757,9 +1757,9 @@
 define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #256
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1777,8 +1777,8 @@
 define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #-254]
 ; CHECK-NEXT: bx lr
@@ -1796,9 +1796,9 @@
 define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #256
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1816,8 +1816,8 @@
 define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -1835,9 +1835,9 @@
 define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1855,8 +1855,8 @@
 define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -1874,8 +1874,8 @@
 define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]
 ; CHECK-NEXT: bx lr
@@ -1893,9 +1893,9 @@
 define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #256
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1913,8 +1913,8 @@
 define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
 ; CHECK-NEXT: bx lr
@@ -1932,9 +1932,9 @@
 define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #256
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -1952,8 +1952,8 @@
 define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -1971,8 +1971,8 @@
 define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #3]
 ; CHECK-NEXT: bx lr
@@ -1990,8 +1990,8 @@
 define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -2009,8 +2009,8 @@
 define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #127]
 ; CHECK-NEXT: bx lr
@@ -2028,9 +2028,9 @@
 define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #128
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2048,8 +2048,8 @@
 define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]
 ; CHECK-NEXT: bx lr
@@ -2067,9 +2067,9 @@
 define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #128
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2087,8 +2087,8 @@
 define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -2106,8 +2106,8 @@
 define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #3]
 ; CHECK-NEXT: bx lr
@@ -2125,8 +2125,8 @@
 define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -2144,8 +2144,8 @@
 define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #127]
 ; CHECK-NEXT: bx lr
@@ -2163,9 +2163,9 @@
 define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #128
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2183,8 +2183,8 @@
 define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]
 ; CHECK-NEXT: bx lr
@@ -2202,9 +2202,9 @@
 define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #128
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2222,8 +2222,8 @@
 define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -2241,8 +2241,8 @@
 define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #3]
 ; CHECK-NEXT: bx lr
@@ -2260,8 +2260,8 @@
 define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -2279,8 +2279,8 @@
 define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #127]
 ; CHECK-NEXT: bx lr
@@ -2298,9 +2298,9 @@
 define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #128
-; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2318,8 +2318,8 @@
 define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]
 ; CHECK-NEXT: bx lr
@@ -2337,9 +2337,9 @@
 define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #128
-; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2357,8 +2357,8 @@
 define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -2376,9 +2376,9 @@
 define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2396,9 +2396,9 @@
 define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_2:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #2
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2416,8 +2416,8 @@
 define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
 ; CHECK-NEXT: bx lr
@@ -2435,9 +2435,9 @@
 define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #512
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2455,8 +2455,8 @@
 define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
 ; CHECK-NEXT: bx lr
@@ -2474,9 +2474,9 @@
 define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #512
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2494,8 +2494,8 @@
 define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]
 ; CHECK-NEXT: bx lr
@@ -2513,9 +2513,9 @@
 define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2533,8 +2533,8 @@
 define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]
 ; CHECK-NEXT: bx lr
@@ -2552,8 +2552,8 @@
 define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]
 ; CHECK-NEXT: bx lr
@@ -2571,9 +2571,9 @@
 define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: add.w r1, r0, #256
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
@@ -2591,8 +2591,8 @@
 define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
 ; CHECK-NEXT: bx lr
@@ -2610,9 +2610,9 @@
 define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: sub.w r1, r0, #256
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r1]
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
@@ -1543,8 +1543,8 @@
 define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -1562,8 +1562,8 @@
 define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #3
@@ -1582,8 +1582,8 @@
 define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #2
@@ -1602,8 +1602,8 @@
 define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #508
 ; CHECK-NEXT: bx lr
@@ -1621,8 +1621,8 @@
 define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: add.w r0, r0, #512
@@ -1641,8 +1641,8 @@
 define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
 ; CHECK-NEXT: bx lr
@@ -1660,8 +1660,8 @@
 define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: sub.w r0, r0, #512
@@ -1680,8 +1680,8 @@
 define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -1699,8 +1699,8 @@
 define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #3
@@ -1719,8 +1719,8 @@
 define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -1738,8 +1738,8 @@
 define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0], #254
 ; CHECK-NEXT: bx lr
@@ -1757,8 +1757,8 @@
 define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: add.w r0, r0, #256
@@ -1777,8 +1777,8 @@
 define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0], #-254
 ; CHECK-NEXT: bx lr
@@ -1796,8 +1796,8 @@
 define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: sub.w r0, r0, #256
@@ -1816,8 +1816,8 @@
 define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -1835,8 +1835,8 @@
 define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: adds r0, #3
@@ -1855,8 +1855,8 @@
 define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -1874,8 +1874,8 @@
 define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #254
 ; CHECK-NEXT: bx lr
@@ -1893,8 +1893,8 @@
 define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: add.w r0, r0, #256
@@ -1913,8 +1913,8 @@
 define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #-254
 ; CHECK-NEXT: bx lr
@@ -1932,8 +1932,8 @@
 define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: sub.w r0, r0, #256
@@ -1952,8 +1952,8 @@
 define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -1971,8 +1971,8 @@
 define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0], #3
 ; CHECK-NEXT: bx lr
@@ -1990,8 +1990,8 @@
 define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -2009,8 +2009,8 @@
 define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0], #127
 ; CHECK-NEXT: bx lr
@@ -2028,8 +2028,8 @@
 define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #128
@@ -2048,8 +2048,8 @@
 define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0], #-127
 ; CHECK-NEXT: bx lr
@@ -2067,8 +2067,8 @@
 define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0]
 ; CHECK-NEXT: subs r0, #128
@@ -2087,8 +2087,8 @@
 define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -2106,8 +2106,8 @@
 define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0], #3
 ; CHECK-NEXT: bx lr
@@ -2125,8 +2125,8 @@
 define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -2144,8 +2144,8 @@
 define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0], #127
 ; CHECK-NEXT: bx lr
@@ -2163,8 +2163,8 @@
 define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0]
 ; CHECK-NEXT: adds r0, #128
@@ -2183,8 +2183,8 @@
 define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0], #-127
 ; CHECK-NEXT: bx lr
@@ -2202,8 +2202,8 @@
 define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0]
 ; CHECK-NEXT: subs r0, #128
@@ -2222,8 +2222,8 @@
 define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -2241,8 +2241,8 @@
 define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0], #3
 ; CHECK-NEXT: bx lr
@@ -2260,8 +2260,8 @@
 define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -2279,8 +2279,8 @@
 define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0], #127
 ; CHECK-NEXT: bx lr
@@ -2298,8 +2298,8 @@
 define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0]
 ; CHECK-NEXT: adds r0, #128
@@ -2318,8 +2318,8 @@
 define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0], #-127
 ; CHECK-NEXT: bx lr
@@ -2337,8 +2337,8 @@
 define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0]
 ; CHECK-NEXT: subs r0, #128
@@ -2357,8 +2357,8 @@
 define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -2376,8 +2376,8 @@
 define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #3
@@ -2396,8 +2396,8 @@
 define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: adds r0, #2
@@ -2416,8 +2416,8 @@
 define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #508
 ; CHECK-NEXT: bx lr
@@ -2435,8 +2435,8 @@
 define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: add.w r0, r0, #512
@@ -2455,8 +2455,8 @@
 define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
 ; CHECK-NEXT: bx lr
@@ -2474,8 +2474,8 @@
 define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: sub.w r0, r0, #512
@@ -2494,8 +2494,8 @@
 define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #4
 ; CHECK-NEXT: bx lr
@@ -2513,8 +2513,8 @@
 define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: adds r0, #3
@@ -2533,8 +2533,8 @@
 define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #2
 ; CHECK-NEXT: bx lr
@@ -2552,8 +2552,8 @@
 define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #254
 ; CHECK-NEXT: bx lr
@@ -2571,8 +2571,8 @@
 define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: add.w r0, r0, #256
@@ -2591,8 +2591,8 @@
 define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0], #-254
 ; CHECK-NEXT: bx lr
@@ -2610,8 +2610,8 @@
 define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: sub.w r0, r0, #256
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
@@ -24,8 +24,8 @@
 define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwu32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -44,8 +44,8 @@
 define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwu32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -83,8 +83,8 @@
 define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwu32_512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -122,8 +122,8 @@
 define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwu32_m512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -162,8 +162,8 @@
 define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhu32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -223,8 +223,8 @@
 define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhu32_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -264,8 +264,8 @@
 define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhu32_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -305,8 +305,8 @@
 define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhs32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.s32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -366,8 +366,8 @@
 define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhs32_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.s32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -407,8 +407,8 @@
 define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrhs32_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrht.s32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -447,8 +447,8 @@
 define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhu16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -505,8 +505,8 @@
 define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhu16_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -544,8 +544,8 @@
 define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhu16_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -644,8 +644,8 @@
 define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrbu32_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -685,8 +685,8 @@
 define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrbu32_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -786,8 +786,8 @@
 define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrbs32_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrbt.s32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -827,8 +827,8 @@
 define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrbs32_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrbt.s32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -928,8 +928,8 @@
 define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrbu16_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -969,8 +969,8 @@
 define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrbu16_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1070,8 +1070,8 @@
 define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrbs16_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrbt.s16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1111,8 +1111,8 @@
 define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrbs16_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrbt.s16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1208,8 +1208,8 @@
 define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
 ; CHECK-LABEL: ldrbu8_128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vpt.i8 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u8 q0, [r0]
 ; CHECK-NEXT: vstrb.8 q0, [r1]
@@ -1247,8 +1247,8 @@
 define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
 ; CHECK-LABEL: ldrbu8_m128:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vpt.i8 ne, q0, zr
 ; CHECK-NEXT: vldrbt.u8 q0, [r0]
 ; CHECK-NEXT: vstrb.8 q0, [r1]
@@ -1286,8 +1286,8 @@
 define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwf32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -1306,8 +1306,8 @@
 define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwf32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -1345,8 +1345,8 @@
 define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwf32_512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -1384,8 +1384,8 @@
 define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
 ; CHECK-LABEL: ldrwf32_m512:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vpt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vstrw.32 q0, [r1]
@@ -1423,8 +1423,8 @@
 define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhf16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1481,8 +1481,8 @@
 define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhf16_256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1520,8 +1520,8 @@
 define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
 ; CHECK-LABEL: ldrhf16_m256:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vpt.i16 ne, q0, zr
 ; CHECK-NEXT: vldrht.u16 q0, [r0]
 ; CHECK-NEXT: vstrh.16 q0, [r1]
@@ -1543,8 +1543,8 @@
 define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -1562,9 +1562,9 @@
 define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1582,9 +1582,9 @@
 define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_2:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1602,8 +1602,8 @@
 define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]!
 ; CHECK-NEXT: bx lr
@@ -1621,9 +1621,9 @@
 define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1641,8 +1641,8 @@
 define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]!
 ; CHECK-NEXT: bx lr
@@ -1660,9 +1660,9 @@
 define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strw32_m512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1680,8 +1680,8 @@
 define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -1699,9 +1699,9 @@
 define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1719,8 +1719,8 @@
 define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -1738,8 +1738,8 @@
 define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #254]!
 ; CHECK-NEXT: bx lr
@@ -1757,9 +1757,9 @@
 define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1777,8 +1777,8 @@
 define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrh.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0, #-254]!
 ; CHECK-NEXT: bx lr
@@ -1796,9 +1796,9 @@
 define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strh32_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrht.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1816,8 +1816,8 @@
 define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -1835,9 +1835,9 @@
 define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1855,8 +1855,8 @@
 define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -1874,8 +1874,8 @@
 define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]!
 ; CHECK-NEXT: bx lr
@@ -1893,9 +1893,9 @@
 define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1913,8 +1913,8 @@
 define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]!
 ; CHECK-NEXT: bx lr
@@ -1932,9 +1932,9 @@
 define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strh16_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -1952,8 +1952,8 @@
 define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -1971,8 +1971,8 @@
 define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #3]!
 ; CHECK-NEXT: bx lr
@@ -1990,8 +1990,8 @@
 define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -2009,8 +2009,8 @@
 define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #127]!
 ; CHECK-NEXT: bx lr
@@ -2028,9 +2028,9 @@
 define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2048,8 +2048,8 @@
 define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrb.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]!
 ; CHECK-NEXT: bx lr
@@ -2067,9 +2067,9 @@
 define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strb32_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrbt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2087,8 +2087,8 @@
 define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -2106,8 +2106,8 @@
 define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #3]!
 ; CHECK-NEXT: bx lr
@@ -2125,8 +2125,8 @@
 define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -2144,8 +2144,8 @@
 define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #127]!
 ; CHECK-NEXT: bx lr
@@ -2163,9 +2163,9 @@
 define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrb.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2183,8 +2183,8 @@
 define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrb.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]!
 ; CHECK-NEXT: bx lr
@@ -2202,9 +2202,9 @@
 define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strb16_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrb.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrbt.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2222,8 +2222,8 @@
 define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -2241,8 +2241,8 @@
 define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_3:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #3]!
 ; CHECK-NEXT: bx lr
@@ -2260,8 +2260,8 @@
 define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -2279,8 +2279,8 @@
 define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #127]!
 ; CHECK-NEXT: bx lr
@@ -2298,9 +2298,9 @@
 define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: adds r0, #128
 ; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2318,8 +2318,8 @@
 define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m127:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vldrb.u8 q0, [r1]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]!
 ; CHECK-NEXT: bx lr
@@ -2337,9 +2337,9 @@
 define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
 ; CHECK-LABEL: strb8_m128:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: subs r0, #128
 ; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vldrb.u8 q1, [r2]
 ; CHECK-NEXT: vpt.i8 ne, q1, zr
 ; CHECK-NEXT: vstrbt.8 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2357,8 +2357,8 @@
 define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -2376,9 +2376,9 @@
 define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2396,9 +2396,9 @@
 define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_2:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: adds r0, #2
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2416,8 +2416,8 @@
 define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]!
 ; CHECK-NEXT: bx lr
@@ -2435,9 +2435,9 @@
 define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: add.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2455,8 +2455,8 @@
 define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m508:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]!
 ; CHECK-NEXT: bx lr
@@ -2474,9 +2474,9 @@
 define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
 ; CHECK-LABEL: strwf32_m512:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: sub.w r0, r0, #512
 ; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
 ; CHECK-NEXT: vpt.i32 ne, q1, zr
 ; CHECK-NEXT: vstrwt.32 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2494,8 +2494,8 @@
 define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]!
 ; CHECK-NEXT: bx lr
@@ -2513,9 +2513,9 @@
 define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_3:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: adds r0, #3
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2533,8 +2533,8 @@
 define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]!
 ; CHECK-NEXT: bx lr
@@ -2552,8 +2552,8 @@
 define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]!
 ; CHECK-NEXT: bx lr
@@ -2571,9 +2571,9 @@
 define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: add.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
@@ -2591,8 +2591,8 @@
 define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m254:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]!
 ; CHECK-NEXT: bx lr
@@ -2610,9 +2610,9 @@
 define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
 ; CHECK-LABEL: strhf16_m256:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: sub.w r0, r0, #256
 ; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
 ; CHECK-NEXT: vpt.i16 ne, q1, zr
 ; CHECK-NEXT: vstrht.16 q0, [r0]
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -648,14 +648,14 @@
 ; CHECK-NEXT: add.w r9, r3, r5
 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16
 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16
-; CHECK-NEXT: vfma.f32 q3, q6, q5
 ; CHECK-NEXT: add.w r12, r9, r5
+; CHECK-NEXT: vfma.f32 q3, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r9]
-; CHECK-NEXT: vfma.f32 q4, q6, q5
 ; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: vfma.f32 q4, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r12]
-; CHECK-NEXT: vfma.f32 q2, q6, q5
 ; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vfma.f32 q2, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r6]
 ; CHECK-NEXT: vfma.f32 q0, q6, q5
 ; CHECK-NEXT: vldrw.u32 q6, [r7]
@@ -866,17 +866,17 @@
 ; CHECK-NEXT: add.w r12, r3, r5
 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16
 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: vfma.f32 q4, q7, q6
 ; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: vfma.f32 q4, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r12]
-; CHECK-NEXT: vfma.f32 q5, q7, q6
 ; CHECK-NEXT: add.w r6, r10, r5
+; CHECK-NEXT: vfma.f32 q5, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r10]
-; CHECK-NEXT: vfma.f32 q2, q7, q6
 ; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vfma.f32 q2, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r6]
-; CHECK-NEXT: vfma.f32 q0, q7, q6
 ; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vfma.f32 q0, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r7]
 ; CHECK-NEXT: vfma.f32 q3, q7, q6
 ; CHECK-NEXT: vldrw.u32 q7, [r6]
@@ -1107,47 +1107,49 @@
 ; CHECK-NEXT: .LBB6_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r10, r3, r5
 ; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: vpsttt
+; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT: vfmat.f32 q5, q0, q7
 ; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q5, q0, q7
 ; CHECK-NEXT: vldrwt.u32 q0, [r10]
-; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: add.w r6, r11, r5
 ; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q6, q0, q7
 ; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
 ; CHECK-NEXT: vmov q5, q4
 ; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q3, q1
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q0, [r6]
-; CHECK-NEXT: vmov q3, q1
 ; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
 ; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: adds r6, r7, r5
 ; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT: vmov q1, q3
 ; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
-; CHECK-NEXT: sub.w r12, r12, #4
-; CHECK-NEXT: adds r6, r7, r5
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
 ; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpstttt
 ; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q4, q0, q7
 ; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q2, q0, q7
 ; CHECK-NEXT: le lr, .LBB6_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
@@ -1396,54 +1398,55 @@
 ; CHECK-NEXT: .LBB7_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r11, r3, r5
 ; CHECK-NEXT: vctp.32 r10
-; CHECK-NEXT: vpsttt
+; CHECK-NEXT: add.w r11, r3, r5
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: sub.w r10, r10, #4
 ; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q6, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r11]
-; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q7, q1, q0
 ; CHECK-NEXT: vmov q5, q3
 ; CHECK-NEXT: vmov q3, q4
+; CHECK-NEXT: vmov q4, q2
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vmov q4, q2
 ; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
 ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
 ; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vpst
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r6]
+; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT: vmov q2, q4
 ; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q3, q5
-; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r6, r5
 ; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r6]
 ; CHECK-NEXT: vfmat.f32 q2, q1, q0
-; CHECK-NEXT: sub.w r10, r10, #4
-; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vpstttt
 ; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: adds r6, r7, r5
+; CHECK-NEXT: vmov q3, q5
+; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vfmat.f32 q4, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r6]
-; CHECK-NEXT: vfmat.f32 q5, q1, q0
+; CHECK-NEXT: vmov q5, q6
 ; CHECK-NEXT: add r6, r5
 ; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q5, q1, q0
 ; CHECK-NEXT: vldrwt.u32 q1, [r6]
+; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vpst
 ; CHECK-NEXT: vfmat.f32 q3, q1, q0
 ; CHECK-NEXT: le lr, .LBB7_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
--- a/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-constfold.ll
@@ -13,8 +13,8 @@
 ; CHECK-NEXT: vaddve.s16 r2, q1
 ; CHECK-NEXT: vaddvt.s16 r4, q0
 ; CHECK-NEXT: vaddve.s16 r6, q0
-; CHECK-NEXT: strd r6, r4, [r0]
 ; CHECK-NEXT: strd r2, r12, [r0, #8]
+; CHECK-NEXT: strd r6, r4, [r0]
 ; CHECK-NEXT: pop {r4, r6, r7, pc}
 entry:
 %0 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 13107)
@@ -164,10 +164,10 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: movs r1, #1
 ; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: movw r1, #65533
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q0
 ; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #65533
 ; CHECK-NEXT: vmsr p0, r1
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q0
@@ -242,8 +242,8 @@
 ; CHECK-NEXT: vpste
 ; CHECK-NEXT: vaddvat.s32 r0, q0
 ; CHECK-NEXT: vaddvae.s32 r0, q1
-; CHECK-NEXT: vadd.i32 q1, q0, r0
 ; CHECK-NEXT: vpnot
+; CHECK-NEXT: vadd.i32 q1, q0, r0
 ; CHECK-NEXT: vpste
 ; CHECK-NEXT: vaddvat.s32 r0, q1
 ; CHECK-NEXT: vaddvae.s32 r0, q0
@@ -272,8 +272,8 @@
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q0
 ; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: vadd.i32 q1, q0, r0
 ; CHECK-NEXT: vpnot
+; CHECK-NEXT: vadd.i32 q1, q0, r0
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q1
 ; CHECK-NEXT: vaddvat.s32 r0, q0
@@ -299,14 +299,14 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: movw r1, #1234
 ; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: movw r1, #64300
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q0
 ; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #64300
 ; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: movw r1, #64301
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vaddvat.s32 r0, q1
-; CHECK-NEXT: movw r1, #64301
 ; CHECK-NEXT: vmsr p0, r1
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vaddvat.s32 r0, q1
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -20,8 +20,8 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16
 ; CHECK-NEXT: vptt.f32 ge, q1, q4
-; CHECK-NEXT: vmovt q1, q4
 ; CHECK-NEXT: vmovt q0, q2
+; CHECK-NEXT: vmovt q1, q4
 ; CHECK-NEXT: vadd.i32 q2, q2, q3
 ; CHECK-NEXT: letp lr, .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %do.end
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -520,9 +520,8 @@
 ; CHECK-NEXT: vdup.32 q4, r9
 ; CHECK-NEXT: add.w r9, r9, #4
 ; CHECK-NEXT: vorr q4, q4, q0
-; CHECK-NEXT: vpt.u32 cs, q1, q4
+; CHECK-NEXT: vptt.u32 cs, q1, q4
 ; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q5, [r1], #16
 ; CHECK-NEXT: vmov.f32 s24, s18
 ; CHECK-NEXT: vmov.f32 s26, s19
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -314,10 +314,10 @@
 ; CHECK-NEXT: dlstp.32 lr, r2
 ; CHECK-NEXT: .LBB5_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q4, [r0], #16
 ; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q4, q7
+; CHECK-NEXT: vldrw.u32 q4, [r0], #16
 ; CHECK-NEXT: vadd.i32 q6, q4, q6
+; CHECK-NEXT: vadd.i32 q5, q4, q7
 ; CHECK-NEXT: vstrw.32 q6, [q3, #128]!
 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vadd.i32 q6, q4, q6
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
--- a/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-loopinvariant.ll
@@ -21,8 +21,8 @@
 ; CHECK-NEXT: .LBB0_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: adds r0, r1, r2
-; CHECK-NEXT: adds r2, #16
 ; CHECK-NEXT: vidup.u8 q0, r0, #1
+; CHECK-NEXT: adds r2, #16
 ; CHECK-NEXT: vstrb.8 q0, [r3], #16
 ; CHECK-NEXT: letp lr, .LBB0_2
 ; CHECK-NEXT: @ %bb.3: @ %while.end
diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll
@@ -4,9 +4,9 @@
 define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 ; CHECK-LABEL: vctp8:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vctp.8 r0
 ; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vctp.8 r0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
@@ -21,9 +21,9 @@
 define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 ; CHECK-LABEL: vctp16:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vctp.16 r0
 ; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vctp.16 r0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
@@ -38,9 +38,9 @@
 define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
 ; CHECK-LABEL: vctp32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vctp.32 r0
 ; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vctp.32 r0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vmovt q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r2]
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -139,8 +139,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i32_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -155,8 +155,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i32_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -393,8 +393,8 @@
 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i64_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddlvt.u32 r0, r1, q0
 ; CHECK-NEXT: bx lr
@@ -409,8 +409,8 @@
 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddlvt.s32 r0, r1, q0
 ; CHECK-NEXT: bx lr
@@ -524,8 +524,8 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u16 r0, q0
 ; CHECK-NEXT: bx lr
@@ -540,8 +540,8 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvt.s16 r0, q0
 ; CHECK-NEXT: bx lr
@@ -557,8 +557,8 @@
 ; CHECK-LABEL: add_v4i8_v4i32_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vand q1, q1, q2
+; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -573,8 +573,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8_v4i32_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmov.i32 q2, #0xff
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
@@ -621,8 +621,8 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i16_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
@@ -638,8 +638,8 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvt.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
@@ -1350,8 +1350,8 @@
 ; CHECK-LABEL: add_v4i8_v4i64_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vand q1, q1, q2
+; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddlvt.u32 r0, r1, q0
 ; CHECK-NEXT: bx lr
@@ -1366,8 +1366,8 @@
 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmov.i32 q2, #0xff
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
@@ -1633,8 +1633,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.u16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvat.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -1650,8 +1650,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvat.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -2012,8 +2012,8 @@
 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q2, #0xff
-; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vand q1, q1, q2
+; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
 ; CHECK-NEXT: vaddvat.u32 r0, q0
 ; CHECK-NEXT: bx lr
@@ -2029,8 +2029,8 @@
 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmov.i32 q2, #0xff
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vand q1, q1, q2
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q1, zr
@@ -2080,8 +2080,8 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvat.u16 r0, q0
 ; CHECK-NEXT: uxth r0, r0
@@ -2098,8 +2098,8 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q1, zr
 ; CHECK-NEXT: vaddvat.u16 r0, q0
 ; CHECK-NEXT: sxth r0, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -146,9 +146,9 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i32_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.u16 q1, q1
 ; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -165,9 +165,9 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i32_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -315,9 +315,9 @@
 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i64_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.u16 q1, q1
 ; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -334,9 +334,9 @@
 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
 ; CHECK-LABEL: add_v4i16_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -523,9 +523,9 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.u8 q1, q1
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -542,9 +542,9 @@
 define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i32_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -561,8 +561,8 @@
 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8i16_v8i32_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q2, q2
+; CHECK-NEXT: vmovlb.u8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -579,8 +579,8 @@
 define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8i16_v8i32_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmovlb.u8 q2, q2
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.s16 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -598,9 +598,9 @@
 ; CHECK-LABEL: add_v4i8_v4i32_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vand q0, q0, q3
-; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -617,11 +617,11 @@
 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8_v4i32_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmov.i32 q3, #0xff
-; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
@@ -641,8 +641,8 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u32 r0, q0, q1
@@ -762,9 +762,9 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i16_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.u8 q1, q1
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
 ; CHECK-NEXT: uxth r0, r0
@@ -782,9 +782,9 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavt.u16 r0, q0, q1
 ; CHECK-NEXT: sxth r0, r0
@@ -1406,9 +1406,9 @@
 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i64_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.u8 q1, q1
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1425,9 +1425,9 @@
 define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
 ; CHECK-LABEL: add_v8i8_v8i64_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1445,9 +1445,9 @@
 ; CHECK-LABEL: add_v4i8_v4i64_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vand q0, q0, q3
-; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1464,11 +1464,11 @@
 define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmov.i32 q3, #0xff
-; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
@@ -1488,8 +1488,8 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1506,8 +1506,8 @@
 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8i16_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
@@ -1529,8 +1529,8 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.u16 q1, q1
-; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1548,8 +1548,8 @@
 define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
 ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: vmovlb.s8 q0, q0
 ; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
@@ -1858,9 +1858,9 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.u16 q1, q1
 ; CHECK-NEXT: vmovlb.u16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -1878,9 +1878,9 @@
 define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -2187,9 +2187,9 @@
 ; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q3, #0xff
+; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vand q1, q1, q3
 ; CHECK-NEXT: vand q0, q0, q3
-; CHECK-NEXT: vand q2, q2, q3
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
 ; CHECK-NEXT: bx lr
@@ -2207,11 +2207,11 @@
 define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
 ; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.i32 q3, #0xff
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmov.i32 q3, #0xff
-; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vmovlb.s16 q1, q1
 ; CHECK-NEXT: vmovlb.s16 q0, q0
 ; CHECK-NEXT: vpt.i32 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u32 r0, q0, q1
@@ -2266,9 +2266,9 @@
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.u8 q1, q1
 ; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
 ; CHECK-NEXT: uxth r0, r0
@@ -2287,9 +2287,9 @@
 define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
 ; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmovlb.s8 q1, q1
 ; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vpt.i16 eq, q2, zr
 ; CHECK-NEXT: vmlavat.u16 r0, q0, q1
 ; CHECK-NEXT: sxth r0, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-tailpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-tailpred.ll
@@ -117,8 +117,8 @@
 ; CHECK-NEXT: .LBB2_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
-; CHECK-NEXT: vabs.f16 q0, q0
 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
+; CHECK-NEXT: vabs.f16 q0, q0
 ; CHECK-NEXT: vmaxnm.f16 q0, q0, q1
 ; CHECK-NEXT: vstrh.16 q0, [r2], #16
 ; CHECK-NEXT: letp lr, .LBB2_2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll
@@ -166,10 +166,10 @@
 ; CHECK-NEXT: vsub.i16 q3, q0, q1
 ; CHECK-NEXT: vmovlb.u8 q2, q2
 ; CHECK-NEXT: vmul.i16 q3, q2, q3
+; CHECK-NEXT: vldrb.u16 q2, [r0], #8
 ; CHECK-NEXT: vmla.u16 q3, q1, r3
-; CHECK-NEXT: vshr.u16 q3, q3, #8
 ; CHECK-NEXT: vldrb.u16 q1, [r2], #8
-; CHECK-NEXT: vldrb.u16 q2, [r0], #8
+; CHECK-NEXT: vshr.u16 q3, q3, #8
 ; CHECK-NEXT: vstrb.16 q3, [r0, #-16]
 ; CHECK-NEXT: letp lr, .LBB3_1
 ; CHECK-NEXT: @ %bb.2: @ %do.end
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -54,9 +54,9 @@
 ; CHECK-NEXT: push {r4, r5, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vmov.i64 q5, #0xffffffff
 ; CHECK-NEXT: vpt.s32 lt, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q4, [r0]
-; CHECK-NEXT: vmov.i64 q5, #0xffffffff
 ; CHECK-NEXT: vmov.f32 s0, s16
 ; CHECK-NEXT: vmov.f32 s2, s17
 ; CHECK-NEXT: vand q6, q0, q5