Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -16448,6 +16448,19 @@ switch (II->getIntrinsicID()) { case Intrinsic::fma: return !IsFMS(I); + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_fma_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_hsub_predicated: + return Operand == 1; default: return false; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -17,19 +17,18 @@ ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: dlstp.32 lr, r1 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: vadd.f32 s0, s3, s3 ; CHECK-NEXT: vcvt.f32.u32 s4, s4 ; CHECK-NEXT: vdiv.f32 s0, s0, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vdup.32 q1, r3 -; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vsub.f32 q2, q2, q1 -; CHECK-NEXT: vfma.f32 q0, q2, q2 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vsub.f32 q1, q1, r12 +; CHECK-NEXT: vfma.f32 q0, q1, q1 ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: subs r0, r1, #1 Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -683,84 +683,86 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill ; CHECK-NEXT: bne .LBB5_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #84] -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #68] -; CHECK-NEXT: add.w r1, r3, r7, lsl #1 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: adds r1, r3, r7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r7, r7, lsl #1 -; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: add.w r0, r1, r2, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: adds r0, r7, #7 -; CHECK-NEXT: lsr.w r9, r0, #3 +; CHECK-NEXT: adds r0, r2, #7 +; CHECK-NEXT: lsrs r2, r0, #3 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #92] -; CHECK-NEXT: add.w r0, r8, r10 +; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r0, [r1, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: subs.w lr, r9, r9 -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: subs.w lr, r2, r2 +; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #84] +; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q1, [r4], #8 -; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r8, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB5_7 ; CHECK-NEXT: b .LBB5_4 ; CHECK-NEXT: .LBB5_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 @@ -869,83 +871,85 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: add.w r12, sp, #12 ; CHECK-NEXT: cmp r3, #4 -; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill ; CHECK-NEXT: bne .LBB6_8 ; CHECK-NEXT: @ %bb.1: @ %for.cond.preheader -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #84] -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #68] -; CHECK-NEXT: add.w r1, r3, r7, lsl #1 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: adds r1, r3, r7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r7, r7, lsl #1 -; CHECK-NEXT: vdup.16 q0, r0 -; CHECK-NEXT: adds r0, r3, r1 +; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #72] +; CHECK-NEXT: add.w r0, r1, r2, lsl #1 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: adds r0, r1, r2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r2, r2, lsl #1 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: adds r0, r7, #7 -; CHECK-NEXT: lsr.w r9, r0, #3 +; CHECK-NEXT: adds r0, r2, #7 +; CHECK-NEXT: lsrs r2, r0, #3 ; CHECK-NEXT: .LBB6_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #88] -; CHECK-NEXT: subs.w lr, r9, r9 -; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #92] +; CHECK-NEXT: subs.w lr, r2, r2 +; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #84] +; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r3 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 +; CHECK-NEXT: mla r3, r9, r11, r0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q1, [r4], #8 -; CHECK-NEXT: vadd.i16 q2, q1, q0 +; CHECK-NEXT: vldrb.s16 q0, [r7], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 +; CHECK-NEXT: vldrb.s16 q0, [r3], #8 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r8, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r1], #8 -; CHECK-NEXT: vadd.i16 q2, q2, q0 -; CHECK-NEXT: vmlava.s16 r10, q1, q2 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r1], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: letp lr, .LBB6_5 ; CHECK-NEXT: b .LBB6_7 ; CHECK-NEXT: .LBB6_6: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #92] -; CHECK-NEXT: add.w r0, r8, r10 +; CHECK-NEXT: add.w r0, r10, r8 +; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r11] -; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: cmp r11, r2 +; CHECK-NEXT: strb.w r0, [r1, r9] +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end -; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: ldr r0, [sp, #96] +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp = icmp eq i16 %num_cols, 4 Index: llvm/test/CodeGen/Thumb2/mve-qrintr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-qrintr.ll +++ llvm/test/CodeGen/Thumb2/mve-qrintr.ll @@ -10,13 +10,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -55,13 +54,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vsub.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -100,13 +98,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB2_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.i32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -145,13 +142,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqadd.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -190,13 +186,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqsub.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqsub.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -235,13 +230,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB5_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB5_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vhadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhadd.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -280,13 +274,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB6_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vhsub.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhsub.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -325,13 +318,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB7_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB7_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q1, [r0] -; CHECK-NEXT: vqdmullb.s16 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vqdmullb.s16 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -374,13 +366,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB8_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB8_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqdmulh.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB8_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -419,13 +410,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB9_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB9_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqrdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB9_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -464,13 +454,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB10_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB10_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vadd.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB10_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -509,13 +498,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB11_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB11_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB11_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -554,13 +542,12 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB12_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB12_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f32 q0, q0, r1 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB12_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -599,14 +586,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB13_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB13_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfma.f32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB13_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -647,15 +633,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB14_1: @ %while.body.lr.ph -; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB14_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfmas.f32 q1, q0, r2 +; CHECK-NEXT: vstrw.32 q1, [r0], #16 ; CHECK-NEXT: letp lr, .LBB14_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc}