Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -362,6 +362,14 @@
   /// to which instructions should be sunk.
   virtual bool shouldSink(const MachineInstr &MI) const { return true; }
 
+  /// Return true if MachineLICM should hoist the cheap instruction \p MI,
+  /// given the register pressure cost deltas in \p Cost.
+  virtual bool
+  shouldHoistCheapInsts(const MachineInstr &MI,
+                        const DenseMap<unsigned, int> &Cost) const {
+    return false;
+  }
+
   /// Re-issue the specified 'original' instruction at the
   /// specific location targeting a new destination register.
   /// The register in Orig->getOperand(0).getReg() will be substituted by
Index: llvm/lib/CodeGen/MachineLICM.cpp
===================================================================
--- llvm/lib/CodeGen/MachineLICM.cpp
+++ llvm/lib/CodeGen/MachineLICM.cpp
@@ -225,7 +225,8 @@
 
     bool IsCheapInstruction(MachineInstr &MI) const;
 
-    bool CanCauseHighRegPressure(const DenseMap<unsigned, int> &Cost,
+    bool CanCauseHighRegPressure(const MachineInstr &MI,
+                                 const DenseMap<unsigned, int> &Cost,
                                  bool Cheap);
 
     void UpdateBackTraceRegPressure(const MachineInstr *MI);
@@ -1181,9 +1182,9 @@
 
 /// Visit BBs from header to current BB, check if hoisting an instruction of the
 /// given cost matrix can cause high register pressure.
-bool
-MachineLICMBase::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
-                                         bool CheapInstr) {
+bool MachineLICMBase::CanCauseHighRegPressure(
+    const MachineInstr &MI, const DenseMap<unsigned, int> &Cost,
+    bool CheapInstr) {
   for (const auto &RPIdAndCost : Cost) {
     if (RPIdAndCost.second <= 0)
       continue;
@@ -1193,7 +1194,9 @@
 
     // Don't hoist cheap instructions if they would increase register pressure,
     // even if we're under the limit.
-    if (CheapInstr && !HoistCheapInsts)
+    if (CheapInstr && !(HoistCheapInsts.getNumOccurrences()
+                            ? HoistCheapInsts
+                            : TII->shouldHoistCheapInsts(MI, Cost)))
       return true;
 
     for (const auto &RP : BackTrace)
@@ -1281,7 +1284,7 @@
 
   // Visit BBs from header to current BB, if hoisting this doesn't cause
   // high register pressure, then it's safe to proceed.
-  if (!CanCauseHighRegPressure(Cost, CheapInstr)) {
+  if (!CanCauseHighRegPressure(MI, Cost, CheapInstr)) {
     LLVM_DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
     ++NumLowRP;
     return true;
Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -231,6 +231,10 @@
 
   bool shouldSink(const MachineInstr &MI) const override;
 
+  bool
+  shouldHoistCheapInsts(const MachineInstr &MI,
+                        const DenseMap<unsigned, int> &Cost) const override;
+
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig,
Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3224,6 +3224,17 @@
   return true;
 }
 
+bool ARMBaseInstrInfo::shouldHoistCheapInsts(
+    const MachineInstr &MI, const DenseMap<unsigned, int> &Cost) const {
+  // Especially in tight MVE loops, most instructions are not really "cheap"
+  // (even movs/copies), and should all be pulled out of loops where possible.
+  // The vector instructions that take GPR registers moved from float values
+  // can also leave copies in the loop. Thumb2 cores have enough registers to
+  // make this generally useful (MachineLICM will already handle the case where
+  // we are actually at the register limit).
+  return Subtarget.isMClass() && Subtarget.isThumb2();
+}
+
 bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      unsigned Reg, MachineRegisterInfo *MRI) const {
Index: llvm/test/CodeGen/Thumb2/mve-floatregloops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-floatregloops.ll
+++ llvm/test/CodeGen/Thumb2/mve-floatregloops.ll
@@ -7,13 +7,13 @@
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.f32 q1, q1, r3
-; CHECK-NEXT: vstrb.8 q1, [r1], #16
+; CHECK-NEXT: vadd.f32 q0, q0, r3
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
 ; CHECK-NEXT: bne .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -52,13 +52,13 @@
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: .LBB1_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.f32 q1, q1, r3
-; CHECK-NEXT: vstrb.8 q1, [r1], #16
+; CHECK-NEXT: vadd.f32 q0, q0, r3
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
 ; CHECK-NEXT: bne .LBB1_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -97,13 +97,13 @@
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: .LBB2_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmul.f32 q1, q1, r3
-; CHECK-NEXT: vstrb.8 q1, [r1], #16
+; CHECK-NEXT: vmul.f32 q0, q0, r3
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
 ; CHECK-NEXT: bne .LBB2_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -142,13 +142,13 @@
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: .LBB3_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmul.f32 q1, q1, r3
-; CHECK-NEXT: vstrb.8 q1, [r1], #16
+; CHECK-NEXT: vmul.f32 q0, q0, r3
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
 ; CHECK-NEXT: bne .LBB3_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -187,13 +187,13 @@
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: .LBB4_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vsub.f32 q1, q1, r3
-; CHECK-NEXT: vstrb.8 q1, [r1], #16
+; CHECK-NEXT: vsub.f32 q0, q0, r3
+; CHECK-NEXT: vstrb.8 q0, [r1], #16
 ; CHECK-NEXT: bne .LBB4_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -279,14 +279,14 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB6_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfmas.f32 q2, q1, r12
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrb.8 q1, [r2], #16
 ; CHECK-NEXT: bne .LBB6_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -329,14 +329,14 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB7_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfmas.f32 q2, q1, r12
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vfmas.f32 q1, q0, r12
+; CHECK-NEXT: vstrb.8 q1, [r2], #16
 ; CHECK-NEXT: bne .LBB7_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -379,14 +379,14 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfma.f32 q2, q1, r12
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vfma.f32 q1, q0, r12
+; CHECK-NEXT: vstrb.8 q1, [r2], #16
 ; CHECK-NEXT: bne .LBB8_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -429,15 +429,15 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vdup.32 q0, r12
 ; CHECK-NEXT: .LBB9_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vldrw.u32 q3, [r1], #16
-; CHECK-NEXT: vdup.32 q1, r12
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfma.f32 q3, q1, q2
-; CHECK-NEXT: vstrb.8 q3, [r2], #16
+; CHECK-NEXT: vfma.f32 q2, q0, q1
+; CHECK-NEXT: vstrb.8 q2, [r2], #16
 ; CHECK-NEXT: bne .LBB9_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -481,16 +481,17 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vdup.32 q0, r12
+; CHECK-NEXT: vneg.f32 q0, q0
 ; CHECK-NEXT: .LBB10_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vldrw.u32 q3, [r1], #16
-; CHECK-NEXT: vneg.f32 q1, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
+; CHECK-NEXT: vmov q3, q0
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfma.f32 q1, q3, q2
-; CHECK-NEXT: vstrb.8 q1, [r2], #16
+; CHECK-NEXT: vfma.f32 q3, q2, q1
+; CHECK-NEXT: vstrb.8 q3, [r2], #16
 ; CHECK-NEXT: bne .LBB10_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -585,15 +586,15 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
 ; CHECK-NEXT: .LBB12_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vneg.f32 q1, q1
-; CHECK-NEXT: vfma.f32 q1, q2, r12
-; CHECK-NEXT: vstrb.8 q1, [r2], #16
+; CHECK-NEXT: vneg.f32 q0, q0
+; CHECK-NEXT: vfma.f32 q0, q1, r12
+; CHECK-NEXT: vstrb.8 q0, [r2], #16
 ; CHECK-NEXT: bne .LBB12_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -636,16 +637,16 @@
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: vmov r12, s0
+; CHECK-NEXT: vdup.32 q0, r12
 ; CHECK-NEXT: .LBB13_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vmov r12, s0
-; CHECK-NEXT: vldrw.u32 q3, [r0], #16
-; CHECK-NEXT: vdup.32 q1, r12
-; CHECK-NEXT: vneg.f32 q2, q2
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vldrw.u32 q2, [r0], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vfma.f32 q2, q1, q3
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vneg.f32 q1, q1
+; CHECK-NEXT: vfma.f32 q1, q0, q2
+; CHECK-NEXT: vstrb.8 q1, [r2], #16
 ; CHECK-NEXT: bne .LBB13_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: bx lr
@@ -781,13 +782,13 @@
 ; CHECK-NEXT: .pad #16
 ; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: ldrh r4, [r0]
-; CHECK-NEXT: ldr.w r12, [r0, #4]
+; CHECK-NEXT: ldr r5, [r0, #4]
 ; CHECK-NEXT: subs r7, r4, #1
 ; CHECK-NEXT: cmp r7, #3
 ; CHECK-NEXT: bhi .LBB15_6
 ; CHECK-NEXT: @ %bb.1: @ %if.then
 ; CHECK-NEXT: ldr r6, [r0, #8]
-; CHECK-NEXT: add.w r11, r12, r7, lsl #2
+; CHECK-NEXT: add.w r11, r5, r7, lsl #2
 ; CHECK-NEXT: lsr.w lr, r3, #2
 ; CHECK-NEXT: vldr s0, [r6]
 ; CHECK-NEXT: vldr s2, [r6, #4]
@@ -795,41 +796,43 @@
 ; CHECK-NEXT: vldr s6, [r6, #12]
 ; CHECK-NEXT: wls lr, lr, .LBB15_5
 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: bic r4, r3, #3
+; CHECK-NEXT: strd r3, r4, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: vmov r10, s4
+; CHECK-NEXT: vmov r12, s6
+; CHECK-NEXT: bic r3, r3, #3
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT: vmov r8, s0
+; CHECK-NEXT: add.w r3, r2, r3, lsl #2
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r4, r2, r4, lsl #2
-; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r5
 ; CHECK-NEXT: .LBB15_3: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: adds r7, r1, r6
-; CHECK-NEXT: add.w r5, r12, r6
-; CHECK-NEXT: vldrw.u32 q2, [r7]
+; CHECK-NEXT: add.w r9, r1, r6
 ; CHECK-NEXT: add.w r7, r11, r6
-; CHECK-NEXT: vmov r10, s0
+; CHECK-NEXT: vldrw.u32 q2, [r9]
+; CHECK-NEXT: adds r5, r3, r6
 ; CHECK-NEXT: vstrw.32 q2, [r7]
-; CHECK-NEXT: vmov r9, s2
-; CHECK-NEXT: vldrw.u32 q2, [r5]
-; CHECK-NEXT: vmov r4, s4
 ; CHECK-NEXT: adds r7, r2, r6
+; CHECK-NEXT: vldrw.u32 q2, [r5]
 ; CHECK-NEXT: adds r6, #16
-; CHECK-NEXT: vmul.f32 q2, q2, r10
+; CHECK-NEXT: vmul.f32 q2, q2, r8
 ; CHECK-NEXT: vldrw.u32 q3, [r5, #4]
-; CHECK-NEXT: vmov r8, s6
-; CHECK-NEXT: vfma.f32 q2, q3, r9
+; CHECK-NEXT: vfma.f32 q2, q3, r4
 ; CHECK-NEXT: vldrw.u32 q3, [r5, #8]
 ; CHECK-NEXT: vldrw.u32 q4, [r5, #12]
-; CHECK-NEXT: vfma.f32 q2, q3, r4
-; CHECK-NEXT: vfma.f32 q2, q4, r8
+; CHECK-NEXT: vfma.f32 q2, q3, r10
+; CHECK-NEXT: vfma.f32 q2, q4, r12
 ; CHECK-NEXT: vstrw.32 q2, [r7]
 ; CHECK-NEXT: le lr, .LBB15_3
 ; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: add r4, sp, #4
 ; CHECK-NEXT: add r11, r6
-; CHECK-NEXT: add.w r12, r12, r2, lsl #2
+; CHECK-NEXT: add.w r5, r3, r2, lsl #2
 ; CHECK-NEXT: add.w r1, r1, r2, lsl #2
-; CHECK-NEXT: ldrd r2, r4, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT: ldm r4, {r2, r3, r4} @ 12-byte Folded Reload
 ; CHECK-NEXT: .LBB15_5: @ %while.end
 ; CHECK-NEXT: and r7, r3, #3
 ; CHECK-NEXT: vldrw.u32 q2, [r1]
@@ -837,36 +840,37 @@
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q2, [r11]
 ; CHECK-NEXT: vmov r6, s2
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: vmov r12, s6
 ; CHECK-NEXT: vmov r7, s4
-; CHECK-NEXT: vmul.f32 q0, q0, r5
-; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
+; CHECK-NEXT: vmul.f32 q0, q0, r1
+; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
 ; CHECK-NEXT: vfma.f32 q0, q1, r6
-; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
+; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
 ; CHECK-NEXT: vfma.f32 q0, q1, r7
-; CHECK-NEXT: vldrw.u32 q1, [r12, #12]
-; CHECK-NEXT: vfma.f32 q0, q1, r1
+; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: vfma.f32 q0, q1, r12
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q0, [r2]
-; CHECK-NEXT: ldr.w r12, [r0, #4]
+; CHECK-NEXT: ldr r5, [r0, #4]
 ; CHECK-NEXT: .LBB15_6: @ %if.end
-; CHECK-NEXT: add.w r0, r12, r3, lsl #2
+; CHECK-NEXT: add.w r0, r5, r3, lsl #2
+; CHECK-NEXT: mov r2, r5
 ; CHECK-NEXT: lsr.w lr, r4, #2
 ; CHECK-NEXT: wls lr, lr, .LBB15_10
 ; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
-; CHECK-NEXT: bic r2, r4, #3
-; CHECK-NEXT: adds r1, r2, r3
-; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: add.w r1, r12, r1, lsl #2
+; CHECK-NEXT: bic r7, r4, #3
+; CHECK-NEXT: adds r1, r7, r3
+; CHECK-NEXT: mov r3, r2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: .LBB15_8: @ %while.body51
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT: vstrb.8 q0, [r3], #16
 ; CHECK-NEXT: le lr, .LBB15_8
 ; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
-; CHECK-NEXT: add.w r12, r12, r2, lsl #2
+; CHECK-NEXT: add.w r2, r2, r7, lsl #2
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: .LBB15_10: @ %while.end55
 ; CHECK-NEXT: ands r1, r4, #3
@@ -875,7 +879,7 @@
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
 ; CHECK-NEXT: vctp.32 r1
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r12]
+; CHECK-NEXT: vstrwt.32 q0, [r2]
 ; CHECK-NEXT: .LBB15_12: @ %if.end61
 ; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9}
Index: llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
+++ llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll
@@ -187,20 +187,19 @@
 ; CHECK-NEXT: mvn r2, #3
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: movs r2, #1
-; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2
+; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: .LBB3_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vcmp.f32 ge, q2, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.f32 le, q2, r1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vpt.f32 ge, q1, r1
+; CHECK-NEXT: vcmpt.f32 le, q1, r2
 ; CHECK-NEXT: vpnot
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q0, [r0], #16
 ; CHECK-NEXT: le lr, .LBB3_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -249,18 +248,18 @@
 ; CHECK-NEXT: movs r2, #1
 ; CHECK-NEXT: vneg.f16 s2, s0
 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3
-; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB4_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vldrh.u16 q2, [r0]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vpt.f16 ge, q2, r2
-; CHECK-NEXT: vcmpt.f16 le, q2, r1
+; CHECK-NEXT: vldrh.u16 q1, [r0]
+; CHECK-NEXT: vpt.f16 ge, q1, r2
+; CHECK-NEXT: vcmpt.f16 le, q1, r1
 ; CHECK-NEXT: vpnot
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrht.16 q1, [r0], #16
+; CHECK-NEXT: vstrht.16 q0, [r0], #16
 ; CHECK-NEXT: le lr, .LBB4_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -487,20 +486,19 @@
 ; CHECK-NEXT: mvn r2, #3
 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT: movs r2, #1
-; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: add.w lr, r2, r1, lsr #2
+; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vcmp.f32 ge, q2, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.f32 le, q2, r1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vpt.f32 ge, q1, r1
+; CHECK-NEXT: vcmpt.f32 le, q1, r2
 ; CHECK-NEXT: vpnot
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q0, [r0], #16
 ; CHECK-NEXT: le lr, .LBB8_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}
@@ -549,18 +547,18 @@
 ; CHECK-NEXT: movs r2, #1
 ; CHECK-NEXT: vneg.f16 s2, s0
 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3
-; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB9_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vldrh.u16 q2, [r0]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vpt.f16 ge, q2, r2
-; CHECK-NEXT: vcmpt.f16 le, q2, r1
+; CHECK-NEXT: vldrh.u16 q1, [r0]
+; CHECK-NEXT: vpt.f16 ge, q1, r2
+; CHECK-NEXT: vcmpt.f16 le, q1, r1
 ; CHECK-NEXT: vpnot
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrht.16 q1, [r0], #16
+; CHECK-NEXT: vstrht.16 q0, [r0], #16
 ; CHECK-NEXT: le lr, .LBB9_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
 ; CHECK-NEXT: pop {r7, pc}