diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7732,6 +7732,46 @@ DAG.getConstant(N, DL, MVT::i32)); } +// Returns true if N can be treated as an MVE qr (vector by scalar register) +// instruction with Op as the scalar; subtracts need Op as the second source. +static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { + switch (N->getOpcode()) { + case ISD::ADD: + case ISD::MUL: + case ISD::SADDSAT: + case ISD::UADDSAT: + return true; + case ISD::SUB: + case ISD::SSUBSAT: + case ISD::USUBSAT: + return N->getOperand(1).getNode() == Op; + case ISD::INTRINSIC_WO_CHAIN: + switch (N->getConstantOperandVal(0)) { + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_vhadd: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmulh: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_vqrdmulh: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_vqdmull: + case Intrinsic::arm_mve_vqdmull_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_vhsub: + case Intrinsic::arm_mve_hsub_predicated: + return N->getOperand(2).getNode() == Op; + default: + return false; + } + default: + return false; + } +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -7753,6 +7793,20 @@ if (SplatUndef.isAllOnes()) return DAG.getUNDEF(VT); + // If all the users of this constant splat are qr instruction variants, + // generate a vdup of the constant. 
+ if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && + (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && + all_of(BVN->uses(), + [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { + EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 + : SplatBitSize == 16 ? MVT::v8i16 + : MVT::v16i8; + SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); + SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); + } + if ((ST->hasNEON() && SplatBitSize <= 64) || (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { // Check if an immediate VMOV works. diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2253,6 +2253,31 @@ (v4i32 (ARMvmovImm (i32 1)))), (i32 1))), (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; + + def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; } @@ -5348,33 +5373,40 @@ } multiclass MVE_VHADDSUB_qr_m { 
+ Intrinsic unpred_int, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VxADDSUB_qr; defm : MVE_vec_scalar_int_pat_m(NAME), VTI, unpred_int, pred_int, 1, 1>; + defvar Inst = !cast(NAME); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))), (i32 1))), + (Inst MQPR:$Qm, rGPR:$Rn)>; + } } -multiclass MVE_VHADD_qr_m : - MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, - int_arm_mve_hadd_predicated>; +multiclass MVE_VHADD_qr_m : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, + add_op, shift_op>; -multiclass MVE_VHSUB_qr_m : - MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, - int_arm_mve_hsub_predicated>; +multiclass MVE_VHSUB_qr_m : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, + add_op, shift_op>; -defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; -defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m; -defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; -defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; -defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m; -defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m; -defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m; -defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m; multiclass MVE_VADDSUB_qr_f { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll 
b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/exitcount.ll @@ -19,15 +19,15 @@ ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: ldr r2, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.i16 q0, #0x1800 ; CHECK-NEXT: add.w r0, r0, r12, lsl #1 +; CHECK-NEXT: mov.w r3, #6144 ; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_3: @ %do.body6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r1], #16 -; CHECK-NEXT: vcvt.f16.s16 q1, q1 -; CHECK-NEXT: vmul.f16 q1, q1, q0 -; CHECK-NEXT: vstrh.16 q1, [r0], #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vcvt.f16.s16 q0, q0 +; CHECK-NEXT: vmul.f16 q0, q0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end13 ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll @@ -10,17 +10,17 @@ define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) { ; CHECK-LABEL: vctpi32: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmvn.i32 q1, #0x1f -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: subs r3, r1, #1 -; CHECK-NEXT: vadd.i32 q1, q3, q1 -; CHECK-NEXT: vidup.u32 q2, r2, #8 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: mvn r3, #31 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: subs r2, r1, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r3 +; CHECK-NEXT: vidup.u32 q1, r4, #8 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ 
=>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [q1, #32]! ; CHECK-NEXT: vadd.f32 q0, q0, q2 @@ -30,7 +30,7 @@ ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-NEXT: vabs.f32 s0, s0 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) %4 = extractvalue { <4 x i32>, i32 } %3, 0 %5 = add nsw i32 %1, -1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll @@ -5,39 +5,39 @@ ; CHECK-LABEL: remat_vctp: ; CHECK: @ %bb.0: @ %bb ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: ldrd r5, r12, [sp, #80] +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrd r5, r12, [sp, #64] ; CHECK-NEXT: vmvn.i32 q0, #0x80000000 ; CHECK-NEXT: vmov.i32 q1, #0x3f -; CHECK-NEXT: vmov.i32 q2, #0x1 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %bb6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r1], #16 -; CHECK-NEXT: vabs.s32 q5, q4 -; CHECK-NEXT: vcls.s32 q3, q5 -; CHECK-NEXT: vshl.u32 q5, q5, q3 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vshr.u32 q6, q5, #24 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vldrw.u32 q7, [r5, q6, uxtw #2] -; CHECK-NEXT: vqrdmulh.s32 q6, q7, q5 -; CHECK-NEXT: vqsub.s32 q6, q0, q6 -; CHECK-NEXT: vqrdmulh.s32 q6, q7, q6 -; CHECK-NEXT: vqshl.s32 q6, q6, #1 -; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 +; CHECK-NEXT: vabs.s32 q4, q3 +; CHECK-NEXT: vcls.s32 q2, q4 +; CHECK-NEXT: vshl.u32 q4, q4, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r4 +; CHECK-NEXT: vshr.u32 q5, q4, #24 +; CHECK-NEXT: vand q5, q5, q1 +; CHECK-NEXT: vldrw.u32 q6, [r5, q5, uxtw #2] +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q4 ; 
CHECK-NEXT: vqsub.s32 q5, q0, q5 ; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 ; CHECK-NEXT: vqshl.s32 q5, q5, #1 -; CHECK-NEXT: vpt.s32 lt, q4, zr -; CHECK-NEXT: vnegt.s32 q5, q5 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vqrdmulh.s32 q4, q4, q5 -; CHECK-NEXT: vstrw.32 q4, [r2], #16 -; CHECK-NEXT: vstrw.32 q3, [r3], #16 +; CHECK-NEXT: vqrdmulh.s32 q4, q5, q4 +; CHECK-NEXT: vqsub.s32 q4, q0, q4 +; CHECK-NEXT: vqrdmulh.s32 q4, q5, q4 +; CHECK-NEXT: vqshl.s32 q4, q4, #1 +; CHECK-NEXT: vpt.s32 lt, q3, zr +; CHECK-NEXT: vnegt.s32 q4, q4 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vqrdmulh.s32 q3, q3, q4 +; CHECK-NEXT: vstrw.32 q3, [r2], #16 +; CHECK-NEXT: vstrw.32 q2, [r3], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %bb44 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} bb: %i = zext i16 %arg5 to i32 @@ -93,16 +93,15 @@ ; CHECK-LABEL: dont_remat_predicated_vctp: ; CHECK: @ %bb.0: @ %bb ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrd r6, r12, [sp, #88] +; CHECK-NEXT: ldrd r6, r12, [sp, #72] ; CHECK-NEXT: movs r4, #4 ; CHECK-NEXT: cmp.w r12, #4 ; CHECK-NEXT: vmvn.i32 q0, #0x80000000 ; CHECK-NEXT: csel r5, r12, r4, lt ; CHECK-NEXT: vmov.i32 q1, #0x3f ; CHECK-NEXT: sub.w r5, r12, r5 -; CHECK-NEXT: vmov.i32 q2, #0x1 ; CHECK-NEXT: add.w lr, r5, #3 ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: add.w lr, r5, lr, lsr #2 @@ -114,35 +113,35 @@ ; CHECK-NEXT: vctpt.32 r4 ; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 -; CHECK-NEXT: vabs.s32 q5, q4 -; CHECK-NEXT: vcls.s32 q3, q5 -; CHECK-NEXT: vshl.u32 q5, q5, q3 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vshr.u32 q6, q5, #24 -; CHECK-NEXT: vand q6, q6, q1 -; CHECK-NEXT: vldrw.u32 q7, [r6, q6, uxtw #2] -; 
CHECK-NEXT: vqrdmulh.s32 q6, q7, q5 -; CHECK-NEXT: vqsub.s32 q6, q0, q6 -; CHECK-NEXT: vqrdmulh.s32 q6, q7, q6 -; CHECK-NEXT: vqshl.s32 q6, q6, #1 -; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 +; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 +; CHECK-NEXT: vabs.s32 q4, q3 +; CHECK-NEXT: vcls.s32 q2, q4 +; CHECK-NEXT: vshl.u32 q4, q4, q2 +; CHECK-NEXT: vadd.i32 q2, q2, r5 +; CHECK-NEXT: vshr.u32 q5, q4, #24 +; CHECK-NEXT: vand q5, q5, q1 +; CHECK-NEXT: vldrw.u32 q6, [r6, q5, uxtw #2] +; CHECK-NEXT: vqrdmulh.s32 q5, q6, q4 ; CHECK-NEXT: vqsub.s32 q5, q0, q5 ; CHECK-NEXT: vqrdmulh.s32 q5, q6, q5 ; CHECK-NEXT: vqshl.s32 q5, q5, #1 -; CHECK-NEXT: vpt.s32 lt, q4, zr -; CHECK-NEXT: vnegt.s32 q5, q5 +; CHECK-NEXT: vqrdmulh.s32 q4, q5, q4 +; CHECK-NEXT: vqsub.s32 q4, q0, q4 +; CHECK-NEXT: vqrdmulh.s32 q4, q5, q4 +; CHECK-NEXT: vqshl.s32 q4, q4, #1 +; CHECK-NEXT: vpt.s32 lt, q3, zr +; CHECK-NEXT: vnegt.s32 q4, q4 ; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vqrdmulh.s32 q4, q4, q5 +; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 +; CHECK-NEXT: vqrdmulh.s32 q3, q3, q4 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vstrwt.32 q4, [r2], #16 -; CHECK-NEXT: vstrwt.32 q3, [r3], #16 +; CHECK-NEXT: vstrwt.32 q3, [r2], #16 +; CHECK-NEXT: vstrwt.32 q2, [r3], #16 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %bb44 ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} bb: %i = zext i16 %arg5 to i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll --- a/llvm/test/CodeGen/Thumb2/mve-be.ll +++ b/llvm/test/CodeGen/Thumb2/mve-be.ll @@ -268,9 +268,9 @@ define arm_aapcs_vfpcc <4 x i32> @test(i32* %data) { ; CHECK-LE-LABEL: test: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LE-NEXT: vmov.i32 q0, #0x1 -; CHECK-LE-NEXT: vadd.i32 q1, q1, q0 +; CHECK-LE-NEXT: vldrw.u32 
q0, [r0, #32] +; CHECK-LE-NEXT: movs r0, #1 +; CHECK-LE-NEXT: vadd.i32 q1, q0, r0 ; CHECK-LE-NEXT: @APP ; CHECK-LE-NEXT: vmullb.s32 q0, q1, q1 ; CHECK-LE-NEXT: @NO_APP @@ -278,8 +278,9 @@ ; ; CHECK-BE-LABEL: test: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: movs r1, #1 ; CHECK-BE-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-BE-NEXT: vmov.i32 q0, #0x1 +; CHECK-BE-NEXT: vdup.32 q0, r1 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 ; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: @APP diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -335,66 +335,65 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: sbcs r1, r12, #0 ; CHECK-NEXT: blt.w .LBB1_28 ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph -; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel r12, r2, r7, lt -; CHECK-NEXT: mov r9, r2 -; CHECK-NEXT: mov r1, r12 -; CHECK-NEXT: cmp.w r12, #3 +; CHECK-NEXT: csel r7, r2, r3, lt +; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 ; CHECK-NEXT: movw r2, #43691 -; CHECK-NEXT: sub.w r1, r1, r12 +; CHECK-NEXT: subs r1, r1, r7 ; CHECK-NEXT: movt r2, #43690 ; CHECK-NEXT: adds r1, #2 -; CHECK-NEXT: ldr r4, [sp, #144] +; CHECK-NEXT: ldr r4, [sp, #120] ; CHECK-NEXT: movw r11, :lower16:c -; CHECK-NEXT: 
vmov.i32 q7, #0xc +; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: umull r1, r2, r1, r2 ; CHECK-NEXT: movt r11, :upper16:c ; CHECK-NEXT: movs r1, #4 -; CHECK-NEXT: vmov.i32 q4, #0xc ; CHECK-NEXT: @ implicit-def: $r8 -; CHECK-NEXT: @ implicit-def: $r5 -; CHECK-NEXT: @ implicit-def: $r10 -; CHECK-NEXT: strd r3, r0, [sp, #16] @ 8-byte Folded Spill -; CHECK-NEXT: add.w r6, r7, r2, lsr #1 +; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: movs r5, #12 +; CHECK-NEXT: strd r12, r0, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: add.w r6, r3, r2, lsr #1 ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: movw r2, #65532 ; CHECK-NEXT: vdup.32 q6, r6 ; CHECK-NEXT: movt r2, #32767 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w r1, r7, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: add.w r1, r3, r1, lsr #2 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI1_1 ; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r12 -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q4, q0, r7 +; CHECK-NEXT: @ implicit-def: $r7 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: cmn.w r10, #4 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: cmn.w r9, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r10 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: sub.w r1, r0, r9 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 ; CHECK-NEXT: add.w r2, r3, r2, lsr #1 @@ -403,14 +402,14 @@ ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 ; CHECK-NEXT: add r0, r1 -; 
CHECK-NEXT: add.w r10, r0, #7 -; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: add.w r9, r0, #7 +; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: adds r5, #2 -; CHECK-NEXT: subs r1, r5, r0 -; CHECK-NEXT: asr.w r2, r5, #31 -; CHECK-NEXT: sbcs.w r1, r2, r3 +; CHECK-NEXT: add.w r8, r8, #2 +; CHECK-NEXT: subs.w r1, r8, r0 +; CHECK-NEXT: asr.w r2, r8, #31 +; CHECK-NEXT: sbcs.w r1, r2, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_4: @ %for.cond2.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -418,31 +417,29 @@ ; CHECK-NEXT: @ Child Loop BB1_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_10 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 -; CHECK-NEXT: cmp.w r10, #2 +; CHECK-NEXT: cmp.w r9, #2 ; CHECK-NEXT: bgt .LBB1_3 ; CHECK-NEXT: @ %bb.5: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: cmp.w r12, #5 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #5 ; CHECK-NEXT: bhi .LBB1_15 ; CHECK-NEXT: @ %bb.6: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #136] +; CHECK-NEXT: ldrd r2, r3, [sp, #112] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r7, r12 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: mov r7, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: b .LBB1_8 ; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: add.w r10, r7, #7 -; CHECK-NEXT: cmn.w r7, #4 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: mov r7, r10 +; CHECK-NEXT: add.w r9, r3, #7 +; CHECK-NEXT: cmn.w r3, #4 +; CHECK-NEXT: mov.w r7, #0 +; 
CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bge .LBB1_3 ; CHECK-NEXT: .LBB1_8: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -450,18 +447,18 @@ ; CHECK-NEXT: @ Child Loop BB1_10 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: beq .LBB1_11 ; CHECK-NEXT: @ %bb.9: @ %for.body13.us51.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: movw r2, :lower16:a -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: movt r2, :upper16:a ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: movw r2, :lower16:b ; CHECK-NEXT: movt r2, :upper16:b ; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: dlstp.32 lr, r6 ; CHECK-NEXT: .LBB1_10: @ %vector.body111 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -469,14 +466,14 @@ ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 ; CHECK-NEXT: vshl.i32 q2, q1, #2 ; CHECK-NEXT: vadd.i32 q2, q2, r11 -; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vadd.i32 q1, q1, r5 ; CHECK-NEXT: vstrw.32 q0, [q2] ; CHECK-NEXT: letp lr, .LBB1_10 ; CHECK-NEXT: b .LBB1_13 ; CHECK-NEXT: .LBB1_11: @ %vector.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 @@ -487,7 +484,7 @@ ; CHECK-NEXT: vshl.i32 q2, q1, #2 ; CHECK-NEXT: add.w r1, r1, #4 ; CHECK-NEXT: vadd.i32 q2, q2, r11 -; CHECK-NEXT: vadd.i32 q1, q1, q7 +; CHECK-NEXT: vadd.i32 q1, q1, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 @@ -497,7 +494,7 @@ ; CHECK-NEXT: beq .LBB1_7 ; CHECK-NEXT: @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 
Depth=2 -; CHECK-NEXT: eor r1, r8, #1 +; CHECK-NEXT: eor r1, r7, #1 ; CHECK-NEXT: lsls r1, r1, #31 ; CHECK-NEXT: bne .LBB1_7 ; CHECK-NEXT: b .LBB1_26 @@ -506,12 +503,12 @@ ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload -; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: .LBB1_17: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lsls.w r1, r8, #31 +; CHECK-NEXT: lsls r1, r7, #31 ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.18: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 @@ -527,26 +524,26 @@ ; CHECK-NEXT: bgt .LBB1_24 ; CHECK-NEXT: @ %bb.21: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: add.w r10, r2, #28 +; CHECK-NEXT: add.w r9, r2, #28 ; CHECK-NEXT: cmn.w r2, #25 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: blt .LBB1_17 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r10, r2, #7 +; CHECK-NEXT: add.w r9, r2, #7 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r10, r2, #14 +; CHECK-NEXT: add.w r9, r2, #14 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r10, r2, #21 +; CHECK-NEXT: add.w r9, r2, #21 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_26: @ 
%for.inc19.us ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -555,8 +552,8 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -4,8 +4,8 @@ define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) { ; CHECK-LABEL: gather_inc_mini_4i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i32 q1, #0x4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: vadd.i32 q1, q0, r1 ; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr %1 = add <4 x i32> %offs, @@ -17,10 +17,10 @@ define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) { ; CHECK-LABEL: gather_inc_minipred_4i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i32 q1, #0x4 -; CHECK-NEXT: movw r1, #3855 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: movw r2, #3855 +; CHECK-NEXT: vadd.i32 q1, q0, r1 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr @@ -33,36 +33,36 @@ define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) { ; CHECK-LABEL: gather_inc_mini_8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: 
vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.i32 q2, #0x10 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: mov.w r12, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: vmov r1, lr, d2 -; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r1, lr, d3 +; CHECK-NEXT: vadd.i32 q0, q0, r12 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: ldrh r6, [r1] +; CHECK-NEXT: vmov r1, r5, d0 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh.w r12, [lr] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[0], r1 ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: ldrh.w lr, [lr] -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[5], lr -; CHECK-NEXT: ldrh.w r12, [r12] -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r4 +; CHECK-NEXT: vmov.16 q0[6], r6 ; CHECK-NEXT: vmov.16 q0[7], r12 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} %1 = add <8 x i32> %offs, %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> , <8 x i16> undef) @@ -73,24 +73,24 @@ ; CHECK-LABEL: gather_inc_minipred_8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.i32 q2, #0x10 +; 
CHECK-NEXT: movs r1, #16 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vadd.i32 q1, q1, r1 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[6], r1 ; CHECK-NEXT: bx lr %1 = add <8 x i32> %offs, %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1 @@ -101,39 +101,37 @@ define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) { ; CHECK-LABEL: gather_inc_mini_16i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: movs r5, #16 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q4 -; CHECK-NEXT: vadd.i32 q2, q2, q4 -; CHECK-NEXT: vmov r1, r2, d7 -; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: vadd.i32 q3, q3, r5 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: vadd.i32 q3, q0, q4 +; CHECK-NEXT: vmov r1, r2, d7 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: ldrb r1, [r4] -; CHECK-NEXT: ldrb r4, [r5] +; CHECK-NEXT: vmov r3, r4, d6 +; 
CHECK-NEXT: vadd.i32 q3, q0, r5 +; CHECK-NEXT: vadd.i32 q0, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r5 +; CHECK-NEXT: vadd.i32 q2, q0, r5 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: vmov r2, r4, d6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: vmov r2, r6, d5 +; CHECK-NEXT: vmov.8 q0[1], r4 +; CHECK-NEXT: ldrb r4, [r2] ; CHECK-NEXT: ldrb r2, [r6] -; CHECK-NEXT: vmov r5, r6, d6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[0], r5 -; CHECK-NEXT: ldrb r5, [r6] -; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r5, r6, d7 -; CHECK-NEXT: ldrb r0, [r5] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov r6, r7, d7 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r5, d2 -; CHECK-NEXT: vmov.8 q0[3], r6 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[4], r0 @@ -150,12 +148,11 @@ ; CHECK-NEXT: vmov.8 q0[9], r5 ; CHECK-NEXT: vmov.8 q0[10], r4 ; CHECK-NEXT: vmov.8 q0[11], r2 -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], lr -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[12], r1 +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %1 = add <16 x i32> %offs, %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> , <16 x i8> undef) @@ -167,42 +164,39 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.i32 q4, #0x10 -; CHECK-NEXT: vadd.i32 
q2, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: movs r1, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q0, q0, q4 ; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vadd.i32 q0, q0, r1 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vadd.i32 q1, q3, r0 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: vmov.8 q0[0], r4 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[4], r12 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r1 -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q1, q3, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vmov.8 q0[6], r2 ; CHECK-NEXT: vmov.8 q0[8], lr +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: vmov.8 q0[10], r3 +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.8 q0[14], r4 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov.8 q0[14], r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} %1 = add <16 x i32> %offs, %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1 @@ -546,77 +540,73 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 
-; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: blt .LBB11_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: vmov.i16 q1, #0x8 -; CHECK-NEXT: bic r12, r1, #7 -; CHECK-NEXT: add r1, sp, #8 -; CHECK-NEXT: sub.w r3, r12, #8 -; CHECK-NEXT: add.w r8, r5, r3, lsr #3 -; CHECK-NEXT: adr r5, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: add r2, sp, #12 +; CHECK-NEXT: mov.w r9, #8 +; CHECK-NEXT: bic r1, r1, #7 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: add.w r8, r6, r3, lsr #3 +; CHECK-NEXT: adr r3, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 ; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: vldrh.s32 q4, [r1, #8] -; CHECK-NEXT: vldrh.s32 q3, [r1] -; CHECK-NEXT: vadd.i16 q2, q2, q1 -; CHECK-NEXT: vshl.i32 q4, q4, #1 -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r1, r2, d9 -; CHECK-NEXT: vmov r6, r7, d7 -; CHECK-NEXT: vmov r3, r4, d8 -; CHECK-NEXT: ldrh.w r11, [r2] -; CHECK-NEXT: 
vmov r2, r9, d6 -; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: mov r12, r2 +; CHECK-NEXT: vldrh.s32 q2, [r2, #8] +; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r7, r5, d5 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vldrh.s32 q2, [r2] +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r1, r10, d5 ; CHECK-NEXT: ldrh r7, [r7] -; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh.w r2, [r10] +; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: vmov r3, r11, d4 ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r9, [r9] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r9 -; CHECK-NEXT: vmov.16 q3[2], r6 -; CHECK-NEXT: vmov.16 q3[3], r7 -; CHECK-NEXT: vmov.16 q3[4], r3 -; CHECK-NEXT: vmov.16 q3[5], r4 -; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: mov r1, r10 -; CHECK-NEXT: vmov.16 q3[7], r11 -; CHECK-NEXT: vstrb.8 q3, [r5], #16 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r11, [r11] +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov.16 q2[1], r11 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: vmov.16 q2[4], r10 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov.16 q2[6], r7 +; CHECK-NEXT: vmov.16 q2[7], r5 +; CHECK-NEXT: vstrb.8 q2, [r6], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: add sp, #28 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, 
r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -1254,95 +1244,101 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #72 -; CHECK-NEXT: sub sp, #72 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #68] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: str r2, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: adr r5, .LCPI14_3 ; CHECK-NEXT: adr r7, .LCPI14_1 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: adr r3, .LCPI14_0 -; CHECK-NEXT: bic r1, r2, #7 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: adr r6, .LCPI14_2 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: bic r9, r1, #7 +; CHECK-NEXT: vldrw.u32 q3, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: mov.w lr, #16 +; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr.w lr, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: 
.LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q2, q6, r0 ; CHECK-NEXT: vadd.i32 q1, q5, r0 -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: subs.w r8, r8, #16 -; CHECK-NEXT: vmov r3, r9, d4 +; CHECK-NEXT: vadd.i32 q2, q4, r0 +; CHECK-NEXT: vmov r7, r3, d3 +; CHECK-NEXT: vadd.i32 q6, q0, lr +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: subs.w r9, r9, #16 +; CHECK-NEXT: vmov r4, r10, d2 +; CHECK-NEXT: vadd.i32 q1, q7, lr +; CHECK-NEXT: vadd.i32 q4, q4, lr +; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: ldrb.w r11, [r3] +; CHECK-NEXT: ldrb r3, [r7] +; CHECK-NEXT: vmov r7, r12, d4 ; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q4 -; CHECK-NEXT: vadd.i32 q6, q6, q4 -; CHECK-NEXT: vadd.i32 q7, q7, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: ldrb.w r11, [r6] -; CHECK-NEXT: ldrb.w r10, [r7] -; CHECK-NEXT: vmov r6, r7, d2 -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vadd.i32 q7, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb.w r9, [r9] ; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w r10, [r10] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q1[0], r6 -; CHECK-NEXT: vmov.8 q1[1], r7 -; CHECK-NEXT: vmov r6, r7, d5 -; CHECK-NEXT: vmov.8 q1[2], r4 -; CHECK-NEXT: vmov.8 q1[3], r5 -; CHECK-NEXT: vmov.8 q1[4], r3 -; CHECK-NEXT: vmov.8 q1[5], r9 -; CHECK-NEXT: vmov.8 q1[6], r11 -; CHECK-NEXT: vmov.8 q1[7], r10 +; CHECK-NEXT: ldrb.w r1, [r12] +; CHECK-NEXT: vmov.8 q0[0], r7 +; CHECK-NEXT: vmov.8 q0[1], r1 +; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov.8 q0[3], r6 +; CHECK-NEXT: vmov.8 q0[4], r4 +; CHECK-NEXT: vmov r4, r2, d4 +; CHECK-NEXT: vmov.8 q0[5], r10 +; CHECK-NEXT: vmov.8 q0[6], r3 +; CHECK-NEXT: vmov.8 q0[7], r11 +; CHECK-NEXT: ldrb r6, [r7] +; CHECK-NEXT: vmov r5, r7, d5 +; 
CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: vmov r5, r7, d7 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb.w r9, [r7] -; CHECK-NEXT: vmov r7, r3, d6 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: vmov r4, r7, d14 +; CHECK-NEXT: vmov q7, q1 ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q1[8], r3 -; CHECK-NEXT: vmov.8 q1[9], r4 -; CHECK-NEXT: vmov.8 q1[10], r6 -; CHECK-NEXT: vmov.8 q1[11], r12 -; CHECK-NEXT: vmov.8 q1[12], r7 -; CHECK-NEXT: vmov.8 q1[13], r11 -; CHECK-NEXT: vmov.8 q1[14], r5 -; CHECK-NEXT: vmov.8 q1[15], r9 -; CHECK-NEXT: vstrb.8 q1, [lr], #16 +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: vmov.8 q0[9], r7 +; CHECK-NEXT: vmov.8 q0[10], r1 +; CHECK-NEXT: vmov.8 q0[11], r6 +; CHECK-NEXT: vmov.8 q0[12], r5 +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: vstrb.8 q0, [r8], #16 +; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: cmp r9, r1 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -251,34 +251,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save 
{r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmov.i32 q0, #0x28 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vmov r2, r12, d2 -; CHECK-NEXT: vmov r3, lr, d3 -; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldrh.s32 q0, [r1, #8] +; CHECK-NEXT: mov.w r12, #40 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r12 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r12 ; CHECK-NEXT: vmov r0, r1, d1 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: ldrh.w r12, [lr] +; CHECK-NEXT: ldrh.w lr, [r4] +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[4], r3 ; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -469,29 +469,29 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov.i32 q2, #0x20000 -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: mov.w r12, #131072 +; CHECK-NEXT: vadd.i32 q0, q0, r12 +; CHECK-NEXT: vadd.i32 q1, q1, r12 ; 
CHECK-NEXT: vmov r4, r5, d0 -; CHECK-NEXT: vmov r1, lr, d1 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r0, r12, d1 +; CHECK-NEXT: vmov r3, lr, d3 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 ; CHECK-NEXT: ldrh r6, [r3] -; CHECK-NEXT: ldrh.w r3, [lr] -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh.w r3, [r12] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: ldrh.w r12, [r12] -; CHECK-NEXT: vmov.16 q0[4], r0 ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r6 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -298,11 +298,11 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: scaled_i32_i32_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: movs r2, #20 +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r2 ; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll 
b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -460,8 +460,8 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) { ; CHECK-LABEL: qi4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x10 -; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: @@ -473,8 +473,8 @@ define arm_aapcs_vfpcc <4 x i32> @qi4_unaligned(<4 x i32*> %p) { ; CHECK-LABEL: qi4_unaligned: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x10 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldr r0, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -306,41 +306,41 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrb.s32 q0, [r1, #12] -; CHECK-NEXT: vmov.i32 q2, #0x5 +; CHECK-NEXT: movs r6, #5 +; CHECK-NEXT: vldrb.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q0, q0, r6 +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vadd.i32 q1, q1, r6 ; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q3, q0, q2 -; CHECK-NEXT: vldrb.s32 q0, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; 
CHECK-NEXT: vadd.i32 q1, q0, q2 -; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vadd.i32 q2, q0, r6 ; CHECK-NEXT: ldrb.w lr, [r3] ; CHECK-NEXT: ldrb r3, [r4] -; CHECK-NEXT: ldrb r2, [r5] -; CHECK-NEXT: vmov r4, r5, d6 +; CHECK-NEXT: ldrb.w r8, [r5] +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: ldrb.w r12, [r2] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r4 -; CHECK-NEXT: vmov r4, r6, d3 -; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r4, [r5] +; CHECK-NEXT: vmov.8 q0[1], r4 +; CHECK-NEXT: vmov r4, r7, d3 ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: ldrb r4, [r6] -; CHECK-NEXT: vmov r6, r7, d7 -; CHECK-NEXT: vldrb.s32 q3, [r1, #4] -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: ldrb r0, [r6] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: ldrb r4, [r7] +; CHECK-NEXT: vmov r7, r2, d5 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r6 +; CHECK-NEXT: ldrb r0, [r7] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: vmov.8 q0[3], r2 ; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 @@ -358,10 +358,10 @@ ; CHECK-NEXT: vmov.8 q0[10], r5 ; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[13], r8 ; CHECK-NEXT: vmov.8 q0[14], r12 ; CHECK-NEXT: vmov.8 q0[15], lr -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs @@ -650,57 +650,54 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep5: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush 
{d8, d9} -; CHECK-NEXT: vmov.i32 q4, #0x100 -; CHECK-NEXT: vadd.i32 q3, q3, q4 -; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: mov.w r4, #256 +; CHECK-NEXT: vadd.i32 q3, q3, r4 +; CHECK-NEXT: vadd.i32 q2, q2, r4 ; CHECK-NEXT: vmov r3, r2, d7 -; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vadd.i32 q1, q1, r4 ; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vadd.i32 q3, q0, q4 -; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vadd.i32 q3, q0, r4 +; CHECK-NEXT: vmov r6, r7, d5 ; CHECK-NEXT: ldrb.w lr, [r3] ; CHECK-NEXT: ldrb r3, [r1] ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: ldrb r1, [r5] -; CHECK-NEXT: vmov r2, r5, d6 -; CHECK-NEXT: ldrb r4, [r0] -; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r1, [r6] +; CHECK-NEXT: vmov r2, r6, d6 +; CHECK-NEXT: ldrb r5, [r0] +; CHECK-NEXT: ldrb r0, [r7] ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[0], r2 -; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r2, r5, d7 +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov.8 q0[1], r2 +; CHECK-NEXT: vmov r2, r6, d7 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[2], r2 -; CHECK-NEXT: vmov.8 q0[3], r5 -; CHECK-NEXT: vmov r2, r5, d2 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[4], r2 -; CHECK-NEXT: vmov.8 q0[5], r5 -; CHECK-NEXT: vmov r2, r5, d3 +; CHECK-NEXT: vmov.8 q0[5], r4 +; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[6], r2 -; CHECK-NEXT: vmov.8 q0[7], r5 -; CHECK-NEXT: vmov r2, r5, d4 +; CHECK-NEXT: vmov.8 q0[7], r4 +; CHECK-NEXT: vmov r2, r4, d4 ; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: vmov.8 q0[8], r2 -; CHECK-NEXT: vmov.8 
q0[9], r5 +; CHECK-NEXT: vmov.8 q0[9], r4 ; CHECK-NEXT: vmov.8 q0[10], r1 ; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.8 q0[12], r4 +; CHECK-NEXT: vmov.8 q0[12], r5 ; CHECK-NEXT: vmov.8 q0[13], r3 ; CHECK-NEXT: vmov.8 q0[14], lr ; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -776,8 +776,8 @@ define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) { ; CHECK-LABEL: qi4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q1, #0x10 -; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -325,27 +325,31 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) { ; CHECK-LABEL: non_gatscat_use1: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: adr.w r12, .LCPI7_0 -; CHECK-NEXT: vmov.i32 q0, #0x9 -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vmov.i32 q1, #0xc -; CHECK-NEXT: vmov.i32 q2, #0x8 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: adr r4, .LCPI7_0 +; 
CHECK-NEXT: mov.w r12, #9 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w lr, #12 +; CHECK-NEXT: movs r4, #8 +; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q3, q2 -; CHECK-NEXT: vmul.i32 q5, q3, q0 -; CHECK-NEXT: vmlas.u32 q3, q1, r0 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vadd.i32 q2, q1, r4 +; CHECK-NEXT: vmla.u32 q3, q1, lr +; CHECK-NEXT: vmul.i32 q1, q1, r12 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrw.u32 q6, [q3, #24] -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vstrw.32 q5, [r3] -; CHECK-NEXT: vstrb.8 q6, [r1], #16 +; CHECK-NEXT: vstrw.32 q1, [r3] +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI7_0: @@ -381,29 +385,34 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec, <4 x i32>* %x) { ; CHECK-LABEL: non_gatscat_use2: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: adr.w r12, .LCPI8_0 -; CHECK-NEXT: vmov.i32 q0, #0x12 -; CHECK-NEXT: vldrw.u32 q4, [r12] -; CHECK-NEXT: vmov.i32 q1, #0x9 -; CHECK-NEXT: vmov.i32 q2, #0x8 -; CHECK-NEXT: vmov.i32 q3, #0xc +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: adr r4, .LCPI8_0 +; CHECK-NEXT: movs r5, #18 +; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: mov.w r12, #9 +; CHECK-NEXT: mov.w lr, #12 +; CHECK-NEXT: movs r4, #8 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r5 ; CHECK-NEXT: .LBB8_1: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q5, q4, q2 -; CHECK-NEXT: vmul.i32 q6, q4, q1 -; CHECK-NEXT: vmlas.u32 q4, q3, r0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vadd.i32 q3, q2, r4 +; CHECK-NEXT: vmla.u32 q4, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrw.u32 q7, [q4, #24] -; CHECK-NEXT: vadd.i32 q4, q6, q0 +; CHECK-NEXT: vldrw.u32 q5, [q4, #24] +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.u32 q4, q2, r12 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vstrb.8 q7, [r1], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI8_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -607,35 +607,35 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, i32* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v4i32: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr.w r12, .LCPI10_0 ; CHECK-NEXT: adr.w lr, .LCPI10_1 -; CHECK-NEXT: adr r3, .LCPI10_2 -; CHECK-NEXT: vldrw.u32 q2, [lr] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: adr r4, .LCPI10_2 +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: 
vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q3, [r0, q0, uxtw #2] ; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2] ; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2] -; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: vmul.i32 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: vmul.i32 q6, q5, q6 -; CHECK-NEXT: vmul.i32 q5, q5, q0 -; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] -; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2] +; CHECK-NEXT: vmul.i32 q5, q4, q5 +; CHECK-NEXT: vmul.i32 q4, q4, r3 ; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2] +; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] +; CHECK-NEXT: vstrw.32 q3, [r1, q0, uxtw #2] ; CHECK-NEXT: add.w r1, r1, #48 ; CHECK-NEXT: bne .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI10_0: @@ -689,35 +689,35 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v4i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr.w r12, .LCPI11_0 ; CHECK-NEXT: adr.w lr, .LCPI11_1 -; CHECK-NEXT: adr r3, .LCPI11_2 -; CHECK-NEXT: vldrw.u32 q2, [lr] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: adr r4, .LCPI11_2 +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: vldrw.u32 q0, [r4] +; 
CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u32 q3, [r0, q0] ; CHECK-NEXT: vldrb.u32 q4, [r0, q1] ; CHECK-NEXT: vldrb.u32 q5, [r0, q2] -; CHECK-NEXT: vldrb.u32 q6, [r0, q3] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: vmul.i32 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #12 -; CHECK-NEXT: vmul.i32 q6, q5, q6 -; CHECK-NEXT: vmul.i32 q5, q5, q0 -; CHECK-NEXT: vstrb.32 q5, [r1, q2] -; CHECK-NEXT: vstrb.32 q6, [r1, q3] +; CHECK-NEXT: vmul.i32 q5, q4, q5 +; CHECK-NEXT: vmul.i32 q4, q4, r3 ; CHECK-NEXT: vstrb.32 q4, [r1, q1] +; CHECK-NEXT: vstrb.32 q5, [r1, q2] +; CHECK-NEXT: vstrb.32 q3, [r1, q0] ; CHECK-NEXT: add.w r1, r1, #12 ; CHECK-NEXT: bne .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI11_0: @@ -777,35 +777,35 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v8i16: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr.w r12, .LCPI12_0 ; CHECK-NEXT: adr.w lr, .LCPI12_1 -; CHECK-NEXT: adr r3, .LCPI12_2 -; CHECK-NEXT: vldrw.u32 q2, [lr] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vmov.i16 q0, #0xa +; CHECK-NEXT: adr r4, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB12_1: @ %vector.body 
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q3, [r0, q0, uxtw #1] ; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] ; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] -; CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.i16 q4, q5, q4 +; CHECK-NEXT: vmul.i16 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: vmul.i16 q6, q5, q6 -; CHECK-NEXT: vmul.i16 q5, q5, q0 -; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] -; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1] +; CHECK-NEXT: vmul.i16 q5, q4, q5 +; CHECK-NEXT: vmul.i16 q4, q4, r3 ; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] +; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] +; CHECK-NEXT: vstrh.16 q3, [r1, q0, uxtw #1] ; CHECK-NEXT: add.w r1, r1, #48 ; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI12_0: @@ -871,35 +871,35 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v16i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr.w r12, .LCPI13_0 ; CHECK-NEXT: adr.w lr, .LCPI13_1 -; CHECK-NEXT: adr r3, .LCPI13_2 -; CHECK-NEXT: vldrw.u32 q2, [lr] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vmov.i8 q0, #0xa +; CHECK-NEXT: adr r4, .LCPI13_2 +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q3, [r0, q0] ; CHECK-NEXT: vldrb.u8 q4, [r0, q1] ; CHECK-NEXT: vldrb.u8 q5, [r0, q2] -; CHECK-NEXT: vldrb.u8 q6, [r0, q3] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmul.i8 q4, q5, q4 +; CHECK-NEXT: vmul.i8 q3, q4, q3 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: vmul.i8 q6, q5, q6 -; CHECK-NEXT: vmul.i8 q5, q5, q0 -; CHECK-NEXT: vstrb.8 q5, [r1, q2] -; CHECK-NEXT: vstrb.8 q6, [r1, q3] +; CHECK-NEXT: vmul.i8 q5, q4, q5 +; CHECK-NEXT: vmul.i8 q4, q4, r3 ; CHECK-NEXT: vstrb.8 q4, [r1, q1] +; CHECK-NEXT: vstrb.8 q5, [r1, q2] +; CHECK-NEXT: vstrb.8 q3, [r1, q0] ; CHECK-NEXT: add.w r1, r1, #48 ; CHECK-NEXT: bne .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI13_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -143,26 +143,25 @@ ; CHECK-NEXT: add.w r4, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add.w r12, r3, #4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movw r12, #1250 -; CHECK-NEXT: vmov.i32 q2, #0x3 +; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vadd.i32 q1, q1, r1 -; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dls lr, r12 +; CHECK-NEXT: movs r1, #3 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q3, [r4], #16 -; CHECK-NEXT: vmul.i32 q3, q3, q2 +; CHECK-NEXT: vldrwt.u32 q2, [r4], #16 +; CHECK-NEXT: vmul.i32 q3, q2, r1 +; CHECK-NEXT: vmla.u32 q0, q2, r1 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q3, [q1, #80]! 
-; CHECK-NEXT: vadd.i32 q0, q0, q3 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] +; CHECK-NEXT: str.w r0, [r2, r12, lsl #2] ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: @@ -208,72 +207,65 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocapture %w, i32 %N) { ; CHECK-LABEL: justoffsets: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: adr r5, .LCPI3_1 +; CHECK-NEXT: adr r4, .LCPI3_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: adr r5, .LCPI3_2 +; CHECK-NEXT: movw r9, #47888 +; CHECK-NEXT: movw r10, #50417 ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: adr r4, .LCPI3_1 -; CHECK-NEXT: movw r5, #50417 -; CHECK-NEXT: adr r3, .LCPI3_0 -; CHECK-NEXT: movw r7, #32769 ; CHECK-NEXT: vldrw.u32 q2, [r4] -; CHECK-NEXT: movw r4, #47888 -; CHECK-NEXT: vldrw.u32 q3, [r3] -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x7fff -; CHECK-NEXT: vmov.i32 q0, #0x8000 +; CHECK-NEXT: movw r4, #32769 ; CHECK-NEXT: movw r12, #7471 -; CHECK-NEXT: movw r9, #19595 -; CHECK-NEXT: movw r8, #38470 +; CHECK-NEXT: mov.w r3, #32768 +; CHECK-NEXT: movw r11, #38470 +; CHECK-NEXT: movw r8, #19595 +; CHECK-NEXT: movt r9, #65535 +; CHECK-NEXT: movt r10, #65535 +; CHECK-NEXT: movw r7, #32767 ; CHECK-NEXT: 
movt r4, #65535 -; CHECK-NEXT: movt r5, #65535 +; CHECK-NEXT: movw r5, #13282 ; CHECK-NEXT: movw r6, #19485 -; CHECK-NEXT: movt r7, #65535 -; CHECK-NEXT: movw r3, #13282 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrb.u32 q4, [r0, q0] +; CHECK-NEXT: vldrb.u32 q3, [r0, q1] ; CHECK-NEXT: vldrb.u32 q5, [r0, q2] -; CHECK-NEXT: vldrb.u32 q6, [r0, q3] -; CHECK-NEXT: vldrb.u32 q7, [r0, q1] ; CHECK-NEXT: adds r0, #12 -; CHECK-NEXT: vmul.i32 q4, q5, r8 -; CHECK-NEXT: vmla.u32 q4, q7, r9 -; CHECK-NEXT: vmla.u32 q4, q6, r12 -; CHECK-NEXT: vadd.i32 q4, q4, q0 -; CHECK-NEXT: vshr.u32 q4, q4, #16 -; CHECK-NEXT: vstrb.32 q4, [r1, q1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 q4, q7, q1 -; CHECK-NEXT: vmul.i32 q1, q5, r7 -; CHECK-NEXT: vmla.u32 q1, q7, r3 -; CHECK-NEXT: vmla.u32 q4, q5, r5 -; CHECK-NEXT: vmla.u32 q1, q6, r6 -; CHECK-NEXT: vmla.u32 q4, q6, r4 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q4, q4, q0 -; CHECK-NEXT: vshr.u32 q4, q4, #16 -; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vstrb.32 q4, [r1, q2] -; CHECK-NEXT: vstrb.32 q1, [r1, q3] +; CHECK-NEXT: vmul.i32 q6, q4, r11 +; CHECK-NEXT: vmla.u32 q6, q3, r8 +; CHECK-NEXT: vmla.u32 q6, q5, r12 +; CHECK-NEXT: vadd.i32 q6, q6, r3 +; CHECK-NEXT: vshr.u32 q6, q6, #16 +; CHECK-NEXT: vstrb.32 q6, [r1, q1] +; CHECK-NEXT: vmul.i32 q6, q4, r4 +; CHECK-NEXT: vmul.i32 q4, q4, r10 +; CHECK-NEXT: vmla.u32 q6, q3, r5 +; CHECK-NEXT: vmla.u32 q4, q3, r7 +; CHECK-NEXT: vmla.u32 q6, q5, r6 +; CHECK-NEXT: vmla.u32 q4, q5, r9 +; CHECK-NEXT: vadd.i32 q6, q6, r3 +; CHECK-NEXT: vadd.i32 q3, q4, r3 +; CHECK-NEXT: vshr.u32 q6, q6, #16 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vstrb.32 q3, [r1, q0] +; CHECK-NEXT: vstrb.32 q6, [r1, q2] ; CHECK-NEXT: adds r1, #12 ; CHECK-NEXT: letp lr, .LBB3_2 
; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI3_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-halving.ll b/llvm/test/CodeGen/Thumb2/mve-halving.ll --- a/llvm/test/CodeGen/Thumb2/mve-halving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-halving.ll @@ -234,8 +234,8 @@ ; CHECK-LABEL: vrhadds_v16i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i8 q0, q0, r0 ; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <16 x i8> %x, %y @@ -247,8 +247,8 @@ ; CHECK-LABEL: vrhaddu_v16i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i8 q0, q0, r0 ; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <16 x i8> %x, %y @@ -260,8 +260,8 @@ ; CHECK-LABEL: vrhadds_v8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <8 x i16> %x, %y @@ -273,8 +273,8 @@ ; CHECK-LABEL: vrhaddu_v8i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <8 x i16> %x, %y @@ -286,8 +286,8 @@ ; CHECK-LABEL: vrhadds_v4i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; 
CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.s32 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <4 x i32> %x, %y @@ -299,8 +299,8 @@ ; CHECK-LABEL: vrhaddu_v4i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add <4 x i32> %x, %y @@ -312,8 +312,8 @@ ; CHECK-LABEL: vrhadds_v16i8_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i8 q0, q0, r0 ; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nsw <16 x i8> %x, %y @@ -325,8 +325,8 @@ ; CHECK-LABEL: vrhaddu_v16i8_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i8 q0, q0, r0 ; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nuw <16 x i8> %x, %y @@ -338,8 +338,8 @@ ; CHECK-LABEL: vrhadds_v8i16_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nsw <8 x i16> %x, %y @@ -351,8 +351,8 @@ ; CHECK-LABEL: vrhaddu_v8i16_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nuw <8 x i16> %x, %y @@ -364,8 +364,8 @@ ; CHECK-LABEL: vrhadds_v4i32_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.s32 
q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nsw <4 x i32> %x, %y @@ -377,8 +377,8 @@ ; CHECK-LABEL: vrhaddu_v4i32_nwop: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr %add = add nuw <4 x i32> %x, %y @@ -390,8 +390,8 @@ ; CHECK-LABEL: vrhadds_v16i8_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.s8 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <16 x i8> %x, %y %round = add nsw <16 x i8> %add, @@ -402,8 +402,8 @@ ; CHECK-LABEL: vrhaddu_v16i8_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vmov.i8 q1, #0x1 -; CHECK-NEXT: vhadd.u8 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.u8 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <16 x i8> %x, %y %round = add nuw <16 x i8> %add, @@ -414,8 +414,8 @@ ; CHECK-LABEL: vrhadds_v8i16_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vhadd.s16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.s16 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <8 x i16> %x, %y %round = add nsw <8 x i16> %add, @@ -426,8 +426,8 @@ ; CHECK-LABEL: vrhaddu_v8i16_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmov.i16 q1, #0x1 -; CHECK-NEXT: vhadd.u16 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.u16 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <8 x i16> %x, %y %round = add nuw <8 x i16> %add, @@ -438,8 +438,8 @@ ; CHECK-LABEL: vrhadds_v4i32_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.s32 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <4 x i32> %x, %y %round = add nsw <4 x i32> %add, @@ -450,8 +450,8 
@@ ; CHECK-LABEL: vrhaddu_v4i32_nwrnd: ; CHECK: @ %bb.0: ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: vhadd.u32 q0, q0, r0 ; CHECK-NEXT: bx lr %add = add <4 x i32> %x, %y %round = add nuw <4 x i32> %add, diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -63,24 +63,24 @@ ; CHECK-NEXT: .LBB0_5: @ %vector.body105.preheader ; CHECK-NEXT: vldrw.u32 q0, [r8] ; CHECK-NEXT: vldrw.u32 q1, [r9] -; CHECK-NEXT: vmov.i32 q2, #0x8 +; CHECK-NEXT: movs r0, #8 ; CHECK-NEXT: .LBB0_6: @ %vector.body105 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: cbz r6, .LBB0_7 ; CHECK-NEXT: le .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %vector.body115.ph ; CHECK-NEXT: vldrw.u32 q0, [r9] +; CHECK-NEXT: movs r0, #4 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: @APP ; CHECK-NEXT: nop ; CHECK-NEXT: @NO_APP -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.i32 q0, #0x4 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: .LBB0_8: @ %vector.body115 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.9: diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -4,25 +4,23 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %pResult, i32* nocapture %pIndex) { ; CHECK-LABEL: arm_min_helium_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; 
CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vidup.u32 q2, r4, #1 +; CHECK-NEXT: .save {r4, r6, r7, lr} +; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov.w r12, #4 +; CHECK-NEXT: vidup.u32 q2, r6, #1 ; CHECK-NEXT: movw r4, #54437 ; CHECK-NEXT: movt r4, #21352 -; CHECK-NEXT: vdup.32 q1, r4 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov.i32 q3, #0x4 +; CHECK-NEXT: vdup.32 q1, r4 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vptt.f32 ge, q1, q4 +; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vptt.f32 ge, q1, q3 ; CHECK-NEXT: vmovt q0, q2 -; CHECK-NEXT: vmovt q1, q4 -; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmovt q1, q3 +; CHECK-NEXT: vadd.i32 q2, q2, r12 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_0 @@ -35,8 +33,7 @@ ; CHECK-NEXT: vminv.u32 r1, q0 ; CHECK-NEXT: str r1, [r3] ; CHECK-NEXT: vstr s8, [r2] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r6, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll --- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll @@ -10,13 +10,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB0_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vadd.i32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 
; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -97,13 +97,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB2_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vsub.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vsub.i32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -140,13 +140,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB3_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB3_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vsub.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -183,13 +183,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB4_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmul.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmul.i32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -226,13 +226,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt 
{r7, pc} ; CHECK-NEXT: .LBB5_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB5_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -269,13 +269,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB6_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqadd.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB6_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -312,13 +312,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB7_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqadd.u32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqadd.u32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB7_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -355,13 +355,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB8_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; 
CHECK-NEXT: .LBB8_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqadd.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB8_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -398,13 +398,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB9_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB9_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqsub.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqsub.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB9_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -441,13 +441,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB10_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB10_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqsub.u32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqsub.u32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB10_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -484,13 +484,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB11_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB11_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqsub.s32 q1, 
q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqsub.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB11_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -527,13 +527,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB12_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB12_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vhadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vhadd.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB12_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -570,13 +570,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB13_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB13_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vhadd.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhadd.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB13_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -613,13 +613,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB14_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB14_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vhsub.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vhsub.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 
; CHECK-NEXT: letp lr, .LBB14_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -656,13 +656,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB15_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB15_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vhsub.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vhsub.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB15_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -699,13 +699,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB16_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB16_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqdmullb.s32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqdmullb.s32 q1, q0, r3 +; CHECK-NEXT: vstrw.32 q1, [r1], #16 ; CHECK-NEXT: letp lr, .LBB16_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -744,13 +744,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB17_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i16 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB17_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q1, [r0] -; CHECK-NEXT: vqdmullb.s16 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vqdmullb.s16 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB17_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -790,13 +790,13 @@ ; CHECK-NEXT: it lt ; 
CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB18_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB18_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqdmulh.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB18_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -833,13 +833,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB19_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, #10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB19_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqdmulh.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB19_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -876,13 +876,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB20_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB20_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vqrdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r3 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB20_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -919,13 +919,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB21_1: @ %while.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r2, 
#10 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB21_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vqrdmulh.s32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vqrdmulh.s32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB21_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -962,14 +962,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB22_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB22_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vmul.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vmla.u32 q1, q0, r3 ; CHECK-NEXT: vstrw.32 q1, [r1], #16 ; CHECK-NEXT: letp lr, .LBB22_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -1054,14 +1053,13 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB24_1: @ %for.body.preheader -; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: movs r3, #10 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB24_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmlas.u32 q1, q0, r3 ; CHECK-NEXT: vstrw.32 q1, [r1], #16 ; CHECK-NEXT: letp lr, .LBB24_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup @@ -1189,13 +1187,14 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB27_1: @ %while.body.preheader -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: movt r2, #16672 ; CHECK-NEXT: dlstp.32 
lr, r1 ; CHECK-NEXT: .LBB27_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vadd.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB27_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -1275,13 +1274,14 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB29_1: @ %while.body.preheader -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: movt r2, #16672 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB29_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vsub.f32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB29_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -1361,13 +1361,14 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB31_1: @ %while.body.preheader -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: movt r2, #16672 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB31_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB31_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} @@ -1582,85 +1583,63 @@ define void @rgbconvert(i32* noalias %pwSourceBase, i16 signext %iSourceStride, i16* noalias %phwTargetBase, i16 signext %iTargetStride, i16 %iHeight, i16 %iWidth) { ; CHECK-LABEL: rgbconvert: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; 
CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: ldrsh.w r12, [sp, #128] -; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldrsh.w r3, [sp, #80] +; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: blt .LBB36_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: ldr r7, [sp, #132] -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov.i32 q7, #0xf800 -; CHECK-NEXT: vmov.i32 q1, #0x800000 -; CHECK-NEXT: vmov.i32 q0, #0x10000000 -; CHECK-NEXT: sxth.w r9, r7 -; CHECK-NEXT: cmp.w r9, #4 -; CHECK-NEXT: mov r7, r9 -; CHECK-NEXT: vmov.i32 q2, #0x1f -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r7, #4 -; CHECK-NEXT: vmov.i32 q5, #0x4000000 -; CHECK-NEXT: sub.w r7, r9, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: adds r7, #3 -; CHECK-NEXT: add.w r8, r6, r7, lsr #2 -; CHECK-NEXT: mov.w r7, #2016 -; CHECK-NEXT: vdup.32 q4, r7 -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: ldr r2, [sp, #84] +; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r11, #8388608 +; CHECK-NEXT: mov.w r4, #67108864 +; CHECK-NEXT: sxth.w r12, r2 +; CHECK-NEXT: vmov.i32 q0, #0xf800 +; CHECK-NEXT: vmov.i32 q1, #0x1f +; CHECK-NEXT: mov.w r2, #2016 +; CHECK-NEXT: mov.w r7, #268435456 +; CHECK-NEXT: vdup.32 q2, r2 ; CHECK-NEXT: .LBB36_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB36_3 Depth 2 -; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r4, r9 -; 
CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB36_3: @ %do.body ; CHECK-NEXT: @ Parent Loop BB36_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.32 r4 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q6, [r7], #16 -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: subs r4, #4 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vqdmulht.s32 q7, q6, q5 -; CHECK-NEXT: vqdmulht.s32 q0, q6, q0 -; CHECK-NEXT: vand q7, q7, q4 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vqdmulht.s32 q6, q6, q2 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vorr q0, q7, q0 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vand q6, q6, q7 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vorr q0, q0, q6 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.32 q0, [r6], #8 -; CHECK-NEXT: vmov q0, q3 -; CHECK-NEXT: le lr, .LBB36_3 +; CHECK-NEXT: vldrw.u32 q3, [r5], #16 +; CHECK-NEXT: vqdmulh.s32 q4, q3, r4 +; CHECK-NEXT: vqdmulh.s32 q5, q3, r7 +; CHECK-NEXT: vqdmulh.s32 q3, q3, r11 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vand q5, q5, q1 +; CHECK-NEXT: vand q3, q3, q0 +; CHECK-NEXT: vorr q4, q4, q5 +; CHECK-NEXT: vorr q3, q4, q3 +; CHECK-NEXT: vstrh.32 q3, [r2], #8 +; CHECK-NEXT: letp lr, .LBB36_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: @ in Loop: Header=BB36_2 Depth=1 -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: add.w r2, r2, r3, lsl #1 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r10, r10, #1 ; CHECK-NEXT: add.w r0, r0, r1, lsl #2 -; CHECK-NEXT: cmp r5, r12 +; CHECK-NEXT: cmp r10, r3 +; CHECK-NEXT: add.w r9, r9, r2, lsl #1 ; CHECK-NEXT: bne .LBB36_2 ; CHECK-NEXT: .LBB36_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: 
add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = sext i16 %iHeight to i32 %conv9 = sext i16 %iSourceStride to i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -6,10 +6,10 @@ define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %dst, <4 x i32> %offs) { ; CHECK-LABEL: scatter_inc_minipred_4i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r1, #3855 -; CHECK-NEXT: vmov.i32 q2, #0x4 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: movw r2, #3855 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr @@ -25,28 +25,28 @@ ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vmov.i32 q3, #0x10 +; CHECK-NEXT: mov.w r12, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov.u16 r6, q0[0] -; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: vadd.i32 q1, q1, r12 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r1, lr, d3 ; CHECK-NEXT: vshl.i32 q1, q2, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vadd.i32 q1, q1, r12 +; CHECK-NEXT: vmov r0, r12, d2 ; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: strh r6, [r1] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r2] -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r3] +; CHECK-NEXT: strh r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strh r2, [r1] ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh.w r1, [r12] 
+; CHECK-NEXT: strh.w r1, [lr] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: strh r1, [r0] ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: strh.w r0, [lr] +; CHECK-NEXT: strh.w r0, [r12] ; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: strh r0, [r4] ; CHECK-NEXT: vmov.u16 r0, q0[7] @@ -65,60 +65,57 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: movs r1, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: add r5, sp, #48 -; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: add.w r12, sp, #32 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: vmov lr, r5, d3 ; CHECK-NEXT: vadd.i32 q1, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q1, q4 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vmov lr, r7, d4 +; CHECK-NEXT: vadd.i32 q2, q1, r1 +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vmov r4, r12, d4 ; CHECK-NEXT: vmov.u8 r6, q0[0] ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r0, r8, d5 -; CHECK-NEXT: vadd.i32 q2, q3, q4 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: vmov.u8 r5, q0[6] -; CHECK-NEXT: strb r6, [r1] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r2] +; CHECK-NEXT: vadd.i32 q3, q3, r1 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] ; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r1, r9, d4 -; CHECK-NEXT: strb r6, [r3] -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: strb.w r3, [r12] -; CHECK-NEXT: vmov r3, r6, d5 -; CHECK-NEXT: strb.w r4, [lr] -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: strb r4, [r7] -; CHECK-NEXT: vmov r7, r4, d2 -; CHECK-NEXT: 
strb r5, [r0] +; CHECK-NEXT: vmov r2, r9, d6 +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb r6, [r5] +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: strb r1, [r4] +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb.w r1, [r12] +; CHECK-NEXT: vmov r1, r4, d2 +; CHECK-NEXT: strb r7, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[7] ; CHECK-NEXT: strb.w r0, [r8] -; CHECK-NEXT: vmov r0, r5, d3 +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r9] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r5] +; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: strb r2, [r1] -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb.w r1, [r9] -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r3] -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r6] -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r7] ; CHECK-NEXT: vmov.u8 r1, q0[13] ; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: strb r0, [r5] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: strb r0, [r7] ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} %1 = add <16 x i32> %offs, @@ -285,10 +282,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB5_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -296,40 +291,33 @@ ; CHECK-NEXT: adr r4, .LCPI5_1 ; CHECK-NEXT: adr r5, .LCPI5_2 ; CHECK-NEXT: adr r6, .LCPI5_3 +; CHECK-NEXT: vldrw.u32 q2, [r4] ; 
CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q2, [r4] ; CHECK-NEXT: vldrw.u32 q3, [lr] ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r1 ; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q3, q3, r1 -; CHECK-NEXT: vmov.i32 q4, #0x3 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q4, #0x2 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q4, #0x1 -; CHECK-NEXT: vmov.i32 q7, #0x4 -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: mov.w r12, #1 +; CHECK-NEXT: movs r4, #3 +; CHECK-NEXT: movs r3, #2 +; CHECK-NEXT: movs r1, #4 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vadd.i32 q6, q4, q6 -; CHECK-NEXT: vadd.i32 q5, q4, q7 +; CHECK-NEXT: vadd.i32 q6, q4, r12 +; CHECK-NEXT: vadd.i32 q5, q4, r1 ; CHECK-NEXT: vstrw.32 q6, [q3, #128]! -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q6, q4, q6 +; CHECK-NEXT: vadd.i32 q6, q4, r3 +; CHECK-NEXT: vadd.i32 q4, q4, r4 ; CHECK-NEXT: vstrw.32 q6, [q2, #128]! -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q4, q6 ; CHECK-NEXT: vstrw.32 q4, [q1, #128]! ; CHECK-NEXT: vstrw.32 q5, [q0, #128]! 
; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #48 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -176,27 +176,27 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrh.s32 q2, [r1] -; CHECK-NEXT: vmov.i32 q1, #0x28 +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: mov.w r12, #40 ; CHECK-NEXT: vmov.u16 r6, q0[0] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r12, lr, d5 -; CHECK-NEXT: vldrh.s32 q2, [r1, #8] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r12 +; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: vmov lr, r5, d3 +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r12 ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: strh r6, [r2] -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov r4, r12, d3 +; CHECK-NEXT: strh r6, [r3] +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: strh r3, [r2] ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: strh.w r2, [r12] -; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strh r2, [r5] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: strh r2, [r0] ; CHECK-NEXT: vmov.u16 r0, 
q0[5] @@ -204,7 +204,7 @@ ; CHECK-NEXT: vmov.u16 r0, q0[6] ; CHECK-NEXT: strh r0, [r4] ; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: strh.w r0, [r12] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -236,13 +236,13 @@ define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { ; CHECK-LABEL: ext_scaled_i16_i32_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: vmov.i32 q1, #0xa -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vstrh.32 q0, [r2, q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: movs r2, #10 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r2 +; CHECK-NEXT: vstrh.32 q0, [r3, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -420,65 +420,62 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.s32 q2, [r1] -; CHECK-NEXT: vmov.i32 q1, #0x5 -; CHECK-NEXT: vldrb.s32 q4, [r1, #8] +; CHECK-NEXT: .save {r4, r5, r6, 
r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: movs r2, #5 +; CHECK-NEXT: vldrb.s32 q3, [r1, #8] ; CHECK-NEXT: vmov.u8 r6, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov.u8 r5, q0[4] -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov.u8 r7, q0[6] -; CHECK-NEXT: vmov r12, lr, d5 -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q3, q2, q1 -; CHECK-NEXT: vldrb.s32 q2, [r1, #12] -; CHECK-NEXT: vmov r4, r8, d6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, r9, d7 -; CHECK-NEXT: vadd.i32 q3, q4, q1 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: strb r6, [r2] -; CHECK-NEXT: vmov.u8 r2, q0[1] -; CHECK-NEXT: strb r2, [r3] -; CHECK-NEXT: vmov.u8 r6, q0[2] -; CHECK-NEXT: vmov r2, r10, d6 -; CHECK-NEXT: strb.w r6, [r12] -; CHECK-NEXT: vmov.u8 r6, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: strb.w r6, [lr] -; CHECK-NEXT: vmov r6, r1, d7 -; CHECK-NEXT: strb r5, [r4] -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: strb.w r5, [r8] -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: strb r7, [r0] -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: strb.w r0, [r9] -; CHECK-NEXT: vmov r0, r7, d3 -; CHECK-NEXT: strb r3, [r2] -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: strb.w r2, [r10] -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: strb r2, [r6] -; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vadd.i32 q1, q1, r2 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vadd.i32 q3, q3, r2 +; CHECK-NEXT: vmov lr, r7, d3 +; CHECK-NEXT: vldrb.s32 q1, [r1, #4] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r2 +; CHECK-NEXT: vldrb.s32 q1, [r1, #12] +; CHECK-NEXT: vmov r1, r5, d4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, r8, d5 +; CHECK-NEXT: vadd.i32 
q1, q1, r2 +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: strb r6, [r3] +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: vmov r3, r12, d6 +; CHECK-NEXT: strb.w r4, [lr] +; CHECK-NEXT: vmov.u8 r4, q0[3] +; CHECK-NEXT: strb r4, [r7] +; CHECK-NEXT: vmov r7, r4, d7 +; CHECK-NEXT: vmov.u8 r6, q0[8] ; CHECK-NEXT: strb r2, [r1] -; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.u8 r1, q0[5] ; CHECK-NEXT: strb r1, [r5] +; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: strb r5, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r8] +; CHECK-NEXT: vmov r0, r5, d3 +; CHECK-NEXT: strb r6, [r3] +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: strb r3, [r7] +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: strb r3, [r4] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: strb r3, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r4] +; CHECK-NEXT: strb r1, [r2] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] ; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: strb r0, [r7] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: strb r0, [r5] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -586,8 +586,8 @@ define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x i32*> %p) { ; CHECK-LABEL: qi4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q2, #0x10 -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vstrw.32 q0, [q1] ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll 
b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll --- a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll +++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll @@ -208,22 +208,22 @@ ; CHECK-LABEL: e: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r0, .LCPI14_0 -; CHECK-NEXT: vmov.i32 q1, #0x4 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: movs r0, #4 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: .LBB14_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: cmp r0, #8 -; CHECK-NEXT: csetm r1, eq -; CHECK-NEXT: subs.w r2, r0, #8 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: csel r0, r0, r2, ne -; CHECK-NEXT: vbic q2, q2, q3 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vorr q2, q3, q2 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: cmp r1, #8 +; CHECK-NEXT: csetm r2, eq +; CHECK-NEXT: subs.w r3, r1, #8 +; CHECK-NEXT: vdup.32 q2, r2 +; CHECK-NEXT: csel r1, r1, r3, ne +; CHECK-NEXT: vbic q1, q1, q2 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vorr q1, q2, q1 ; CHECK-NEXT: b .LBB14_1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.2: diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -9,10 +9,10 @@ ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, q3 -; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vadd.i16 q2, q2, r0 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.u16 q2, q2, #1 ; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: vmovnt.i16 q0, q2 @@ -34,10 +34,10 @@ ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0x1 +; 
CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vadd.i32 q0, q0, q3 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.u32 q2, q2, #1 ; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: vmovnt.i32 q0, q2 @@ -204,10 +204,10 @@ ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vadd.i16 q2, q3, q2 -; CHECK-NEXT: vmov.i16 q3, #0x1 +; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q2, q2, q3 -; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vadd.i16 q2, q2, r0 +; CHECK-NEXT: vadd.i16 q0, q0, r0 ; CHECK-NEXT: vshr.u16 q2, q2, #1 ; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: vmovnt.i16 q0, q2 @@ -229,10 +229,10 @@ ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov.i32 q3, #0x1 +; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vadd.i32 q0, q0, q3 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vshr.u32 q2, q2, #1 ; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: vmovnt.i32 q0, q2 @@ -745,21 +745,21 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB18_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, [r0, #8] -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r0], #16 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: vldrb.u16 q0, [r1, #8] +; CHECK-NEXT: vldrb.u16 q1, [r0, #8] +; CHECK-NEXT: vadd.i16 
q0, q1, q0 +; CHECK-NEXT: vldrb.u16 q1, [r0], #16 +; CHECK-NEXT: vadd.i16 q0, q0, r3 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2, #8] +; CHECK-NEXT: vldrb.u16 q0, [r1], #16 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vadd.i16 q0, q0, r3 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -797,21 +797,21 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: vmov.i32 q0, #0x1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB19_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0, #8] -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r0], #16 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u32 q0, [r1, #8] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #16 +; CHECK-NEXT: vadd.i32 q0, q0, r3 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-NEXT: vldrh.u32 q0, [r1], #16 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q0, r3 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -935,21 +935,21 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB21_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, 
[r0, #8] -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r0], #16 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1], #16 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q0 -; CHECK-NEXT: vshr.u16 q1, q1, #1 -; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: vldrb.u16 q0, [r1, #8] +; CHECK-NEXT: vldrb.u16 q1, [r0, #8] +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vldrb.u16 q1, [r0], #16 +; CHECK-NEXT: vadd.i16 q0, q0, r3 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2, #8] +; CHECK-NEXT: vldrb.u16 q0, [r1], #16 +; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vadd.i16 q0, q0, r3 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: vstrb.16 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -987,21 +987,21 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: vmov.i32 q0, #0x1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: .LBB22_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0, #8] -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.u32 q2, [r0], #16 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vldrh.u32 q1, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vshr.u32 q1, q1, #1 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vldrh.u32 q0, [r1, #8] +; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vldrh.u32 q1, [r0], #16 +; CHECK-NEXT: vadd.i32 q0, q0, r3 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2, #8] +; CHECK-NEXT: vldrh.u32 q0, [r1], #16 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vadd.i32 q0, q0, r3 +; CHECK-NEXT: vshr.u32 
q0, q0, #1 +; CHECK-NEXT: vstrh.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB22_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc}