Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2343,6 +2343,15 @@
     return false;
   }
 
+  /// Given a shuffle vector SVI representing a vector splat, return a new
+  /// scalar type of size equal to SVI's scalar type if the new type is more
+  /// profitable. Returns nullptr otherwise. For example, under MVE float
+  /// splats are converted to integer to prevent the need to move from SPR to
+  /// GPR registers.
+  virtual Type *shouldConvertSplatType(ShuffleVectorInst *SVI) const {
+    return nullptr;
+  }
+
   /// Returns true if the opcode is a commutative binary operation.
   virtual bool isCommutativeBinOp(unsigned Opcode) const {
     // FIXME: This should get its info from the td file.
Index: llvm/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -392,6 +392,8 @@
     bool optimizeLoadExt(LoadInst *Load);
     bool optimizeShiftInst(BinaryOperator *BO);
     bool optimizeSelectInst(SelectInst *SI);
+    bool sinkShuffleVectorToShift(ShuffleVectorInst *SVI);
+    bool convertSplatType(ShuffleVectorInst *SVI);
     bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
     bool optimizeSwitchInst(SwitchInst *SI);
     bool optimizeExtractElementInst(Instruction *Inst);
@@ -6431,7 +6433,7 @@
 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
 /// it's often worth sinking a shufflevector splat down to its use so that
 /// codegen can spot all lanes are identical.
-bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+bool CodeGenPrepare::sinkShuffleVectorToShift(ShuffleVectorInst *SVI) {
   BasicBlock *DefBB = SVI->getParent();
 
   // Only do this xform if variable vector shifts are particularly expensive.
@@ -6483,6 +6485,58 @@
   return MadeChange;
 }
 
+/// Some targets only accept certain types for splat inputs. For example, a
+/// VDUP in MVE takes a GPR (integer) register, and the instructions that
+/// incorporate a VDUP (such as a VADD qd, qm, rm) also require a GPR register.
+bool CodeGenPrepare::convertSplatType(ShuffleVectorInst *SVI) {
+  if (!match(SVI,
+             m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+                             m_Undef(), m_ZeroMask())))
+    return false;
+  Type *NewType = TLI->shouldConvertSplatType(SVI);
+  if (!NewType)
+    return false;
+
+  VectorType *SVIVecType = cast<VectorType>(SVI->getType());
+  Type *SVIType = SVIVecType->getScalarType();
+  assert(!NewType->isVectorTy() && "Expected a scalar type!");
+  assert(NewType->getScalarSizeInBits() == SVIType->getScalarSizeInBits() &&
+         "Expected a type of the same size!");
+  Type *NewVecType = VectorType::get(NewType, SVIVecType->getNumElements());
+
+  // Create a bitcast (shuffle (insert (bitcast(..))))
+  IRBuilder<> Builder(SVI->getContext());
+  Builder.SetInsertPoint(SVI);
+  Value *BC1 = Builder.CreateBitCast(
+      cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
+  Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
+                                              (uint64_t)0);
+  Value *Shuffle = Builder.CreateShuffleVector(
+      Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
+  Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
+
+  SVI->replaceAllUsesWith(BC2);
+  RecursivelyDeleteTriviallyDeadInstructions(SVI);
+
+  // Also hoist the bitcast up to its operand if they are not in the same
+  // block.
+  if (auto *BCI = dyn_cast<Instruction>(BC1))
+    if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
+      if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
+          !Op->isTerminator() && !Op->isEHPad())
+        BCI->moveAfter(Op);
+
+  return true;
+}
+
+bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+  if (sinkShuffleVectorToShift(SVI))
+    return true;
+  if (convertSplatType(SVI))
+    return true;
+  return false;
+}
+
 bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
   // If the operands of I can be folded into a target instruction together with
   // I, duplicate and sink them.
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -384,6 +384,7 @@
     bool isZExtFree(SDValue Val, EVT VT2) const override;
     bool shouldSinkOperands(Instruction *I,
                             SmallVectorImpl<Use *> &Ops) const override;
+    Type *shouldConvertSplatType(ShuffleVectorInst *SVI) const override;
 
     bool isFNegFree(EVT VT) const override;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15750,32 +15750,52 @@
   };
 
   for (auto OpIdx : enumerate(I->operands())) {
-    Value *Op = OpIdx.value().get();
+    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
     // Make sure we are not already sinking this operand
-    if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
       continue;
+
+    Instruction *Shuffle = Op;
+    if (Shuffle->getOpcode() == Instruction::BitCast)
+      Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
     // We are looking for a splat that can be sunk.
-    if (!match(Op, m_ShuffleVector(
-                       m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
-                       m_Undef(), m_ZeroMask())))
+    if (!Shuffle ||
+        !match(Shuffle, m_ShuffleVector(
+                            m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+                            m_Undef(), m_ZeroMask())))
       continue;
     if (!IsSinker(I, OpIdx.index()))
       continue;
-    Instruction *Shuffle = cast<Instruction>(Op);
     // All uses of the shuffle should be sunk to avoid duplicating it across gpr
     // and vector registers
-    for (Use &U : Shuffle->uses()) {
+    for (Use &U : Op->uses()) {
       Instruction *Insn = cast<Instruction>(U.getUser());
       if (!IsSinker(Insn, U.getOperandNo()))
         return false;
     }
+    Ops.push_back(&Shuffle->getOperandUse(0));
+    if (Shuffle != Op)
+      Ops.push_back(&Op->getOperandUse(0));
     Ops.push_back(&OpIdx.value());
   }
   return true;
 }
 
+Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
+  if (!Subtarget->hasMVEIntegerOps())
+    return nullptr;
+  Type *SVIType = SVI->getType();
+  Type *ScalarType = SVIType->getScalarType();
+
+  if (ScalarType->isFloatTy())
+    return Type::getInt32Ty(SVIType->getContext());
+  if (ScalarType->isHalfTy())
+    return Type::getInt16Ty(SVIType->getContext());
+  return nullptr;
+}
+
 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   EVT VT = ExtVal.getValueType();
Index: llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -7,8 +7,7 @@
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov.f16 r1, s0
+; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0,
[r0], #16 @@ -54,8 +53,7 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -101,8 +99,7 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -148,8 +145,7 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -195,8 +191,7 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -242,8 +237,7 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vdup.16 q0, r1 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -292,8 +286,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -345,8 +338,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -398,8 +390,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -451,8 +442,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 @@ -505,8 +495,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: vneg.f16 q0, q0 ; CHECK-NEXT: .LBB10_1: @ %vector.body @@ -561,8 +550,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -616,8 +604,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB12_1: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -670,8 +657,7 @@ ; CHECK-NEXT: cmp.w r12, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: bxlt lr -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -819,39 +805,35 @@ ; CHECK-NEXT: ldr r7, [r0, #8] ; CHECK-NEXT: add.w r4, r12, r6, lsl #1 ; CHECK-NEXT: lsr.w lr, r3, #2 -; CHECK-NEXT: vldr.16 s0, [r7, #6] -; CHECK-NEXT: vldr.16 s2, [r7, #4] -; CHECK-NEXT: vldr.16 s4, [r7, #2] -; CHECK-NEXT: vldr.16 s6, [r7] +; CHECK-NEXT: ldrh.w r8, [r7, #6] +; CHECK-NEXT: ldrh.w r9, [r7, #4] +; CHECK-NEXT: ldrh r6, [r7, #2] +; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: vmov.f16 r11, s6 ; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: vmov.f16 r10, s4 ; CHECK-NEXT: bic r5, r3, #3 -; CHECK-NEXT: vmov.f16 r7, s2 -; CHECK-NEXT: add.w r6, r12, #2 -; CHECK-NEXT: vmov.f16 r8, s0 +; CHECK-NEXT: add.w r10, r12, #2 ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r5, r2, r5, lsl #1 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #8 -; CHECK-NEXT: sub.w r9, r6, #2 -; CHECK-NEXT: adds r5, r6, #2 -; CHECK-NEXT: vstrb.8 q2, [r4], #8 -; CHECK-NEXT: vldrw.u32 q2, [r9] -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vmul.f16 q2, q2, r11 -; CHECK-NEXT: vfma.f16 q2, q3, r10 -; CHECK-NEXT: vldrw.u32 q3, [r5] -; CHECK-NEXT: vfma.f16 q2, q3, r7 -; CHECK-NEXT: vldrw.u32 q3, [r6, #4] -; CHECK-NEXT: adds r6, #8 -; CHECK-NEXT: vfma.f16 q2, q3, r8 -; CHECK-NEXT: vstrb.8 q2, [r2], #8 +; CHECK-NEXT: vldrw.u32 q0, [r1], #8 +; CHECK-NEXT: sub.w r11, r10, #2 +; CHECK-NEXT: add.w r5, r10, #2 +; CHECK-NEXT: vstrb.8 q0, [r4], #8 +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: vldrw.u32 q1, [r10] +; CHECK-NEXT: vmul.f16 q0, q0, r7 +; CHECK-NEXT: vfma.f16 q0, q1, r6 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vldrw.u32 q1, [r10, #4] +; CHECK-NEXT: add.w r10, r10, #8 +; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload @@ -861,27 +843,23 @@ ; CHECK-NEXT: add.w r1, r1, r2, lsl #1 ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB15_5: @ %while.end -; CHECK-NEXT: and r7, r3, #3 -; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: vctp.16 r7 +; CHECK-NEXT: and lr, r3, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vctp.16 lr ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q2, [r4] -; CHECK-NEXT: vldrw.u32 q2, [r12] -; CHECK-NEXT: vmov.f16 r1, s6 -; CHECK-NEXT: add.w r7, r12, #2 -; CHECK-NEXT: vmul.f16 q2, q2, r1 -; CHECK-NEXT: vmov.f16 r1, s4 -; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: add.w r7, r12, #6 -; CHECK-NEXT: vfma.f16 q2, q1, r1 +; CHECK-NEXT: vstrht.16 q0, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: add.w r1, r12, #2 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: add.w r1, r12, #6 +; CHECK-NEXT: vmul.f16 q0, q0, r7 +; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r12, #4] -; CHECK-NEXT: vmov.f16 r1, s2 -; CHECK-NEXT: vfma.f16 q2, q1, r1 -; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vldrw.u32 
q0, [r7] -; CHECK-NEXT: vfma.f16 q2, q0, r1 +; CHECK-NEXT: vfma.f16 q0, q1, r9 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vfma.f16 q0, q1, r8 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q2, [r2] +; CHECK-NEXT: vstrht.16 q0, [r2] ; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end ; CHECK-NEXT: add.w r0, r12, r3, lsl #1 Index: llvm/test/CodeGen/Thumb2/mve-float32regloops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -771,84 +771,70 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrh.w r10, [r0] -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrh.w r9, [r0] +; CHECK-NEXT: mov r11, r1 ; CHECK-NEXT: ldr.w r12, [r0, #4] -; CHECK-NEXT: sub.w r1, r10, #1 +; CHECK-NEXT: sub.w r1, r9, #1 ; CHECK-NEXT: cmp r1, #3 ; CHECK-NEXT: bhi .LBB15_6 ; CHECK-NEXT: @ %bb.1: @ %if.then -; CHECK-NEXT: ldr r7, [r0, #8] -; CHECK-NEXT: add.w r4, r12, r1, lsl #2 +; CHECK-NEXT: ldr r4, [r0, #8] ; CHECK-NEXT: lsr.w lr, r3, #2 -; CHECK-NEXT: vldr s0, [r7] -; CHECK-NEXT: vldr s2, [r7, #4] -; CHECK-NEXT: vldr s4, [r7, #8] -; CHECK-NEXT: vldr s6, [r7, #12] +; CHECK-NEXT: ldrd r7, r6, [r4] +; CHECK-NEXT: ldrd r5, r8, [r4, #8] +; CHECK-NEXT: add.w r4, r12, r1, lsl #2 ; CHECK-NEXT: wls lr, lr, .LBB15_5 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph -; CHECK-NEXT: vmov r11, s4 ; CHECK-NEXT: bic r1, r3, #3 -; CHECK-NEXT: vmov r5, s6 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: add.w r10, r12, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vmov r8, s0 -; CHECK-NEXT: add.w r6, r12, #4 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: .LBB15_3: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vstrb.8 q2, [r4], #16 -; CHECK-NEXT: vldrw.u32 q2, [r6, #-4] -; CHECK-NEXT: vldrw.u32 q3, [r6], #16 -; CHECK-NEXT: vmul.f32 q2, q2, r8 -; CHECK-NEXT: vldrw.u32 q4, [r6, #-8] -; CHECK-NEXT: vfma.f32 q2, q3, r7 -; CHECK-NEXT: vldrw.u32 q3, [r6, #-12] -; CHECK-NEXT: vfma.f32 q2, q3, r11 -; CHECK-NEXT: vfma.f32 q2, q4, r5 -; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 +; CHECK-NEXT: vldrw.u32 q0, [r10, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r10], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r7 +; CHECK-NEXT: vldrw.u32 q2, [r10, #-8] +; CHECK-NEXT: vfma.f32 q0, q1, r6 +; CHECK-NEXT: vldrw.u32 q1, [r10, #-12] +; CHECK-NEXT: vfma.f32 q0, q1, r5 +; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: add.w r12, r12, r1, lsl #2 -; CHECK-NEXT: add.w r9, r9, r1, lsl #2 +; CHECK-NEXT: add.w r11, r11, r1, lsl #2 ; CHECK-NEXT: .LBB15_5: @ %while.end -; CHECK-NEXT: and r6, r3, #3 -; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: vldrw.u32 q1, [r9] -; CHECK-NEXT: vctp.32 r6 +; CHECK-NEXT: and r1, r3, #3 +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 
q1, [r4] -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vldrw.u32 q1, [r12, #4] -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vstrwt.32 q0, [r4] ; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vfma.f32 q0, q1, r5 +; CHECK-NEXT: vldrw.u32 q1, [r12, #4] +; CHECK-NEXT: vmul.f32 q0, q0, r7 +; CHECK-NEXT: vfma.f32 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vfma.f32 q0, q1, r7 +; CHECK-NEXT: vfma.f32 q0, q1, r5 ; CHECK-NEXT: vldrw.u32 q1, [r12, #12] -; CHECK-NEXT: vfma.f32 q0, q1, lr +; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r2] ; CHECK-NEXT: ldr.w r12, [r0, #4] ; CHECK-NEXT: .LBB15_6: @ %if.end ; CHECK-NEXT: add.w r0, r12, r3, lsl #2 -; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: lsr.w lr, r9, #2 ; CHECK-NEXT: wls lr, lr, .LBB15_10 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader -; CHECK-NEXT: bic r2, r10, #3 +; CHECK-NEXT: bic r2, r9, #3 ; CHECK-NEXT: adds r1, r2, r3 ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: add.w r1, r12, r1, lsl #2 @@ -861,7 +847,7 @@ ; CHECK-NEXT: add.w r12, r12, r2, lsl #2 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: .LBB15_10: @ %while.end55 -; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: ands r1, r9, #3 ; CHECK-NEXT: beq .LBB15_12 ; CHECK-NEXT: @ %bb.11: @ %if.then59 ; CHECK-NEXT: vldrw.u32 q0, [r0] @@ -870,8 +856,6 @@ ; CHECK-NEXT: vstrwt.32 q0, [r12] ; CHECK-NEXT: .LBB15_12: @ %if.end61 ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 @@ -1410,8 +1394,8 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: ldrb.w lr, [r0] @@ -1426,39 +1410,39 @@ ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 -; CHECK-NEXT: vldr s8, [r0, #4] -; CHECK-NEXT: vldrw.u32 q4, [r12] -; CHECK-NEXT: vldr s12, [r0, #12] ; CHECK-NEXT: mov r7, lr -; CHECK-NEXT: vldr s10, [r0, #8] -; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: ldr.w lr, [r0, #12] +; CHECK-NEXT: ldrd r5, r6, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vldr s12, [r0, #8] +; CHECK-NEXT: vdup.32 q2, lr ; CHECK-NEXT: vldr s14, [r0, #16] -; CHECK-NEXT: vmov.f32 s9, s8 -; CHECK-NEXT: vmov.f32 s13, s12 -; CHECK-NEXT: vldr s4, [r0] -; CHECK-NEXT: vmov.f32 s15, s14 -; CHECK-NEXT: vstrw.32 q4, [r4] -; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: vdup.32 q1, r6 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s7, s12 +; CHECK-NEXT: vmov.f32 s11, s14 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q5, [r1, q0, uxtw #2] -; CHECK-NEXT: vldrw.u32 q6, [r4, q0, uxtw #2] -; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q5, [r4, q0, uxtw #2] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vfma.f32 q6, q5, r6 -; CHECK-NEXT: vstmia r5, {s24, s25} -; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vldrw.u32 q4, [sp, #8] -; CHECK-NEXT: vfma.f32 q4, q6, q3 -; CHECK-NEXT: vfma.f32 q4, 
q5, q2 -; CHECK-NEXT: vstrw.32 q4, [r4] +; CHECK-NEXT: vfma.f32 q5, q4, r5 +; CHECK-NEXT: vstmia r6, {s20, s21} +; CHECK-NEXT: adds r6, #8 +; CHECK-NEXT: vldrw.u32 q3, [sp, #8] +; CHECK-NEXT: vfma.f32 q3, q5, q2 +; CHECK-NEXT: vfma.f32 q3, q4, q1 +; CHECK-NEXT: vstrw.32 q3, [r4] ; CHECK-NEXT: le lr, .LBB17_3 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB17_2 Depth=1 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: adds r0, #20 ; CHECK-NEXT: subs.w lr, r7, #1 -; CHECK-NEXT: vstrb.8 q4, [r12], #16 +; CHECK-NEXT: vstrb.8 q3, [r12], #16 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: bne .LBB17_2 ; CHECK-NEXT: b .LBB17_7 @@ -1470,7 +1454,7 @@ ; CHECK-NEXT: le lr, .LBB17_6 ; CHECK-NEXT: .LBB17_7: ; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %5 = alloca [6 x float], align 4 Index: llvm/test/CodeGen/Thumb2/mve-fma-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -239,9 +239,9 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vneg.f32 s0, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -479,9 +479,9 @@ ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} -; CHECK-NEXT: vneg.f32 s0, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: eor r12, r12, #-2147483648 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll @@ -4,7 +4,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vdupq_n_f16(float %a.coerce) { ; CHECK-LABEL: test_vdupq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vdup.16 q0, r0 ; CHECK-NEXT: bx lr entry: @@ -97,7 +97,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vdupq_m_n_f16(<8 x half> %inactive, float %a.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vdupq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vdupt.16 q0, r1 Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll @@ -24,7 +24,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) { ; CHECK-LABEL: test_vfmaq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r0, s8 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: @@ -53,7 +53,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) { ; CHECK-LABEL: test_vfmasq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r0, s8 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vfmas.f16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: @@ -422,7 +422,7 @@ define arm_aapcs_vfpcc <8 x 
half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vfmaq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s8 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f16 q0, q1, r1 @@ -459,7 +459,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vfmasq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s8 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmast.f16 q0, q1, r1 Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll @@ -106,7 +106,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) { ; CHECK-LABEL: test_vaddq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r0, s4 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vadd.f16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: @@ -171,7 +171,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vaddq_x_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vaddt.f16 q0, q0, r1 Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll @@ -269,7 +269,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vmulq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s8 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vmult.f16 q0, q1, r1 Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll +++ llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll @@ -106,7 +106,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vsubq_n_f16(<8 x half> %a, float %b.coerce) { ; CHECK-LABEL: test_vsubq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r0, s4 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vsub.f16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: @@ -171,7 +171,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vsubq_x_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f16 r1, s4 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.f16 q0, q0, r1 Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1164,32 +1164,32 @@ ; CHECK-NEXT: lsr.w lr, r0, #3 ; CHECK-NEXT: wls lr, lr, .LBB7_12 ; CHECK-NEXT: @ %bb.10: -; CHECK-NEXT: vldrw.u32 q3, [q1, #16] ; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q0, [q1, #16] ; CHECK-NEXT: .LBB7_11: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [q1, #24] +; CHECK-NEXT: vldrw.u32 q3, [q1, #24] ; CHECK-NEXT: vldrw.u32 q4, [q1, #8] -; 
CHECK-NEXT: vadd.f32 q6, q2, q3 -; CHECK-NEXT: vsub.f32 q2, q2, q3 -; CHECK-NEXT: vadd.f32 q5, q4, q0 -; CHECK-NEXT: vsub.f32 q0, q4, q0 -; CHECK-NEXT: vsub.f32 q7, q6, q5 -; CHECK-NEXT: vcadd.f32 q4, q2, q0, #270 +; CHECK-NEXT: vsub.f32 q6, q2, q0 +; CHECK-NEXT: vadd.f32 q0, q2, q0 +; CHECK-NEXT: vsub.f32 q5, q4, q3 +; CHECK-NEXT: vadd.f32 q3, q4, q3 +; CHECK-NEXT: vcadd.f32 q7, q6, q5, #270 +; CHECK-NEXT: vsub.f32 q2, q0, q3 +; CHECK-NEXT: vmul.f32 q7, q7, r0 +; CHECK-NEXT: vadd.f32 q3, q0, q3 ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vcadd.f32 q7, q2, q0, #90 -; CHECK-NEXT: vadd.f32 q0, q6, q5 +; CHECK-NEXT: vcadd.f32 q7, q6, q5, #90 +; CHECK-NEXT: vmul.f32 q4, q2, r0 ; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! -; CHECK-NEXT: vmul.f32 q0, q0, r0 -; CHECK-NEXT: vldrw.u32 q3, [q1, #16] -; CHECK-NEXT: vstrw.32 q0, [q1, #-64] -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmul.f32 q0, q4, r0 -; CHECK-NEXT: vmul.f32 q4, q7, r0 -; CHECK-NEXT: vmul.f32 q5, q5, r0 -; CHECK-NEXT: vstrw.32 q5, [q1, #-56] -; CHECK-NEXT: vstrw.32 q4, [q1, #-48] -; CHECK-NEXT: vstrw.32 q0, [q1, #-40] +; CHECK-NEXT: vmul.f32 q5, q7, r0 +; CHECK-NEXT: vmul.f32 q3, q3, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1, #16] +; CHECK-NEXT: vstrw.32 q3, [q1, #-64] +; CHECK-NEXT: vstrw.32 q4, [q1, #-56] +; CHECK-NEXT: vstrw.32 q5, [q1, #-48] +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [q1, #-40] ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: ; CHECK-NEXT: add sp, #56 Index: llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -174,19 +174,18 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: vneg.f32 s2, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: eor r2, r1, #-2147483648 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpte.f32 ge, q1, r2 -; CHECK-NEXT: vcmpt.f32 le, q1, r1 +; CHECK-NEXT: vpte.f32 ge, q1, r1 +; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -231,13 +230,13 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mvn r2, #7 -; CHECK-NEXT: add.w r1, r2, r1, lsl #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vneg.f16 s2, s0 -; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.f16 r1, s2 -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: mvn r3, #7 +; CHECK-NEXT: add.w r1, r3, r1, lsl #3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vneg.f16 s0, s0 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -458,19 +457,18 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: vneg.f32 s2, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s0 ; 
CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: eor r2, r1, #-2147483648 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpte.f32 ge, q1, r2 -; CHECK-NEXT: vcmpt.f32 le, q1, r1 +; CHECK-NEXT: vpte.f32 ge, q1, r1 +; CHECK-NEXT: vcmpt.f32 le, q1, r2 ; CHECK-NEXT: vstrwe.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -515,13 +513,13 @@ ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mvn r2, #7 -; CHECK-NEXT: add.w r1, r2, r1, lsl #3 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: vneg.f16 s2, s0 -; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov.f16 r1, s2 -; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: mvn r3, #7 +; CHECK-NEXT: add.w r1, r3, r1, lsl #3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vneg.f16 s0, s0 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r1, lsr #3 +; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 Index: llvm/test/CodeGen/Thumb2/mve-vldst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -4,232 +4,234 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { ; CHECK-LABEL: vldst4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 -; CHECK-NEXT: muls r2, r3, r2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: cmp.w r3, r2, lsr #2 +; CHECK-NEXT: .pad #80 +; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: cmp.w r2, r12, lsr #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vldr.16 s0, [sp, #160] ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: and.w r2, r3, r2, lsr #2 +; CHECK-NEXT: ldr r5, [sp, #160] +; CHECK-NEXT: and.w r3, r3, r12, lsr #2 +; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.f16 r12, s0 -; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: add.w lr, r3, r2, lsr #3 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q4, [r0, #32] +; CHECK-NEXT: vldrh.u16 q5, [r0, #32] ; CHECK-NEXT: vldrh.u16 q3, [r0, #48] ; CHECK-NEXT: vldrh.u16 q7, [r0], #64 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmovx.f16 s8, s12 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r3, s22 ; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov r3, s29 -; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov r3, s30 +; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov 
r2, s23 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r2, s26 ; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmul.f16 q0, q1, r12 -; CHECK-NEXT: vmovx.f16 s4, s21 -; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s31 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmul.f16 q0, q1, r5 +; CHECK-NEXT: vmovx.f16 s4, s24 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s29 +; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmovx.f16 s4, s22 ; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vmovx.f16 s4, s20 ; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: vmov.16 q1[4], r4 ; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s8, s14 ; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s23 +; CHECK-NEXT: vmovx.f16 s8, s26 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmovx.f16 s8, s13 ; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmul.f16 q0, q0, r12 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmul.f16 q0, q0, r5 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.16 q1[1], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r2, s13 ; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s29 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov r2, s31 ; CHECK-NEXT: vmov.16 q1[0], r3 ; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r2, s25 ; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r2, s15 ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r2, s27 ; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmovx.f16 s0, s31 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vmovx.f16 s0, s29 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmul.f16 q6, q1, r12 +; CHECK-NEXT: vmul.f16 q4, q1, r5 ; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmovx.f16 s4, s20 +; CHECK-NEXT: vmovx.f16 s4, s25 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vmovx.f16 s4, s23 ; CHECK-NEXT: vmov.16 q0[2], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s16 
+; CHECK-NEXT: vmovx.f16 s4, s21 ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vmovx.f16 s8, s15 ; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 +; CHECK-NEXT: vmovx.f16 s8, s27 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmul.f16 q6, q0, r5 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s24 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmul.f16 q2, q0, r12 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q5[1], r3 +; CHECK-NEXT: vmov r3, s25 ; CHECK-NEXT: vmov.16 q5[4], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 ; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmovx.f16 s0, s17 ; CHECK-NEXT: vmov.16 q3[3], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmovx.f16 s0, s25 ; CHECK-NEXT: vmov.16 q3[6], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vmovx.f16 s0, s9 ; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s27 +; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmov.16 q7[0], r2 -; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov r3, s5 ; CHECK-NEXT: vmov.16 q7[1], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s0, s5 ; CHECK-NEXT: vmov.16 q7[4], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmovx.f16 s16, s18 -; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q2[3], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vmov.16 q2[6], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: 
vmovx.f16 s24, s26 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmovx.f16 s4, s6 ; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmovx.f16 s16, s19 +; CHECK-NEXT: vmov.16 q1[3], r3 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[3], r3 -; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmovx.f16 s16, s27 +; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s16, s19 +; CHECK-NEXT: vmov.f32 s3, s11 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vmov.f32 s3, s27 +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s21, s25 ; CHECK-NEXT: vstrh.16 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s31, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrh.16 q7, [r1, #48] -; CHECK-NEXT: vstrh.16 q1, [r1], #64 -; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vstrh.16 q5, [r1, #-48] +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vmov.f32 s29, s13 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vstrh.16 q2, [r1, #48] +; CHECK-NEXT: vstrh.16 q5, [r1], #64 +; CHECK-NEXT: vmov.f32 s31, s15 +; CHECK-NEXT: vstrh.16 q7, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 %l0 = bitcast i16 %tmp.0.extract.trunc to half