Index: llvm/include/llvm/CodeGen/MachineInstrBundle.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineInstrBundle.h
+++ llvm/include/llvm/CodeGen/MachineInstrBundle.h
@@ -241,6 +241,13 @@
     MachineInstr &MI, Register Reg,
     SmallVectorImpl<std::pair<MachineInstr *, unsigned>> *Ops = nullptr);
 
+/// Return a pair of lane masks (reads, writes) indicating which lanes this
+/// instruction uses with Reg.
+std::pair<LaneBitmask, LaneBitmask>
+AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg,
+                            const MachineRegisterInfo &MRI,
+                            const TargetRegisterInfo &TRI);
+
 /// Information about how a physical register Reg is used by a set of
 /// operands.
 struct PhysRegInfo {
Index: llvm/lib/CodeGen/InlineSpiller.cpp
===================================================================
--- llvm/lib/CodeGen/InlineSpiller.cpp
+++ llvm/lib/CodeGen/InlineSpiller.cpp
@@ -175,6 +175,7 @@
   // All COPY instructions to/from snippets.
   // They are ignored since both operands refer to the same stack slot.
+  // For bundled copies, this will only include the first header copy.
   SmallPtrSet<MachineInstr *, 8> SnippetCopies;
 
   // Values that failed to remat at some point.
@@ -255,15 +256,58 @@
 // This minimizes register pressure and maximizes the store-to-load distance for
 // spill slots which can be important in tight loops.
 
-/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
-/// otherwise return 0.
-static Register isFullCopyOf(const MachineInstr &MI, Register Reg) {
-  if (!MI.isFullCopy())
+/// If MI is a COPY to or from Reg, return the other register, otherwise return
+/// 0.
+static Register isCopyOf(const MachineInstr &MI, Register Reg) {
+  assert(!MI.isBundled());
+  if (!MI.isCopy())
     return Register();
-  if (MI.getOperand(0).getReg() == Reg)
-    return MI.getOperand(1).getReg();
-  if (MI.getOperand(1).getReg() == Reg)
-    return MI.getOperand(0).getReg();
+
+  const MachineOperand &DstOp = MI.getOperand(0);
+  const MachineOperand &SrcOp = MI.getOperand(1);
+
+  // TODO: Probably only worth allowing subreg copies with undef dests.
+  if (DstOp.getSubReg() != SrcOp.getSubReg())
+    return Register();
+  if (DstOp.getReg() == Reg)
+    return SrcOp.getReg();
+  if (SrcOp.getReg() == Reg)
+    return DstOp.getReg();
+  return Register();
+}
+
+/// Check for a copy bundle as formed by SplitKit.
+static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
+  if (!FirstMI.isBundled())
+    return isCopyOf(FirstMI, Reg);
+
+  assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() &&
+         "expected to see first instruction in bundle");
+
+  Register SnipReg;
+  MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator();
+  while (I->isBundledWithSucc()) {
+    const MachineInstr &MI = *I;
+    if (!MI.isCopy())
+      return Register();
+
+    const MachineOperand &DstOp = MI.getOperand(0);
+    const MachineOperand &SrcOp = MI.getOperand(1);
+    if (DstOp.getReg() == Reg) {
+      if (!SnipReg)
+        SnipReg = SrcOp.getReg();
+      else if (SnipReg != SrcOp.getReg())
+        return Register();
+    } else if (SrcOp.getReg() == Reg) {
+      if (!SnipReg)
+        SnipReg = DstOp.getReg();
+      else if (SnipReg != DstOp.getReg())
+        return Register();
+    }
+
+    ++I;
+  }
+
   return Register();
 }
 
@@ -307,14 +351,14 @@
   MachineInstr *UseMI = nullptr;
 
   // Check that all uses satisfy our criteria.
- for (MachineRegisterInfo::reg_instr_nodbg_iterator - RI = MRI.reg_instr_nodbg_begin(SnipLI.reg()), - E = MRI.reg_instr_nodbg_end(); + for (MachineRegisterInfo::reg_bundle_nodbg_iterator + RI = MRI.reg_bundle_nodbg_begin(SnipLI.reg()), + E = MRI.reg_bundle_nodbg_end(); RI != E;) { MachineInstr &MI = *RI++; // Allow copies to/from Reg. - if (isFullCopyOf(MI, Reg)) + if (isCopyOfBundle(MI, Reg)) continue; // Allow stack slot loads. @@ -351,9 +395,8 @@ if (Original == Reg) return; - for (MachineInstr &MI : - llvm::make_early_inc_range(MRI.reg_instructions(Reg))) { - Register SnipReg = isFullCopyOf(MI, Reg); + for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) { + Register SnipReg = isCopyOfBundle(MI, Reg); if (!isSibling(SnipReg)) continue; LiveInterval &SnipLI = LIS.getInterval(SnipReg); @@ -475,7 +518,7 @@ // Find all spills and copies of VNI. for (MachineInstr &MI : - llvm::make_early_inc_range(MRI.use_nodbg_instructions(Reg))) { + llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) { if (!MI.isCopy() && !MI.mayStore()) continue; SlotIndex Idx = LIS.getInstructionIndex(MI); @@ -483,13 +526,14 @@ continue; // Follow sibling copies down the dominator tree. - if (Register DstReg = isFullCopyOf(MI, Reg)) { + if (Register DstReg = isCopyOfBundle(MI, Reg)) { if (isSibling(DstReg)) { - LiveInterval &DstLI = LIS.getInterval(DstReg); - VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); - assert(DstVNI && "Missing defined value"); - assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot"); - WorkList.push_back(std::make_pair(&DstLI, DstVNI)); + LiveInterval &DstLI = LIS.getInterval(DstReg); + VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); + assert(DstVNI && "Missing defined value"); + assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot"); + + WorkList.push_back(std::make_pair(&DstLI, DstVNI)); } continue; } @@ -1111,7 +1155,7 @@ Idx = VNI->def; // Check for a sibling copy. - Register SibReg = isFullCopyOf(MI, Reg); + Register SibReg = isCopyOfBundle(MI, Reg); if (SibReg && isSibling(SibReg)) { // This may actually be a copy between snippets. if (isRegToSpill(SibReg)) { @@ -1202,8 +1246,8 @@ llvm::make_early_inc_range(MRI.reg_instructions(Reg))) { assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy"); // FIXME: Do this with a LiveRangeEdit callback. - LIS.RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); + LIS.getSlotIndexes()->removeSingleMachineInstrFromMaps(MI); + MI.eraseFromBundle(); } } Index: llvm/lib/CodeGen/LiveRangeEdit.cpp =================================================================== --- llvm/lib/CodeGen/LiveRangeEdit.cpp +++ llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -286,8 +286,12 @@ // Never delete a bundled instruction. if (MI->isBundled()) { + // TODO: Handle deleting copy bundles + LLVM_DEBUG(dbgs() << "Won't delete dead bundled inst: " << Idx << '\t' + << *MI); return; } + // Never delete inline asm. 
if (MI->isInlineAsm()) { LLVM_DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI); Index: llvm/lib/CodeGen/MachineInstrBundle.cpp =================================================================== --- llvm/lib/CodeGen/MachineInstrBundle.cpp +++ llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -307,6 +307,34 @@ return RI; } +std::pair +llvm::AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI) { + + LaneBitmask UseMask, DefMask; + + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; + if (!MO.isReg() || MO.getReg() != Reg) + continue; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0 && MO.isUse() && !MO.isUndef()) + UseMask |= MRI.getMaxLaneMaskForVReg(Reg); + + LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg); + if (MO.isDef()) { + if (!MO.isUndef()) + UseMask |= ~SubRegMask; + DefMask |= SubRegMask; + } else if (!MO.isUndef()) + UseMask |= SubRegMask; + } + + return {UseMask, DefMask}; +} + PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg, const TargetRegisterInfo *TRI) { bool AllDefsDead = true; Index: llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir +++ llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir @@ -26,8 +26,7 @@ ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 ; CHECK-NEXT: S_NOP 0, implicit %9.sub1 ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef %11.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0 - ; CHECK-NEXT: S_NOP 0, implicit %11.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub0 ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 ; CHECK-NEXT: S_NOP 0, implicit %7.sub1 ; CHECK-NEXT: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -68,26 +68,24 @@ ; CHECK-NEXT: liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr53 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr54 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr55 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr56 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr57 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr58 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr59 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr60 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable 
$sgpr61 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr62 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr63 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr64 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr65 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr66 = COPY renamable $sgpr68 - ; CHECK-NEXT: renamable $sgpr67 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr36 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr37 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr38 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr39 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr41 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr42 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr43 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr44 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr45 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr46 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr47 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr48 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr49 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr50 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr51 = COPY killed renamable $sgpr68 ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr68 = COPY killed renamable $sgpr84 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 - ; CHECK-NEXT: renamable $sgpr52 = COPY renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr52 = COPY killed renamable $sgpr84 ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr53 = COPY killed renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) @@ -158,7 +156,7 @@ ; CHECK-NEXT: successors: %bb.7(0x80000000) ; CHECK-NEXT: liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %27:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr22_sgpr23, implicit $exec + ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr22_sgpr23, implicit $exec ; CHECK-NEXT: {{ 
$}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) @@ -166,7 +164,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr90_sgpr91 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr92_sgpr93 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead %30:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr33, 11, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[V_INDIRECT_REG_READ_GPR_IDX_B32_V32_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr33, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) @@ -182,40 +180,40 @@ ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr68_sgpr69, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1) - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec ; CHECK-NEXT: renamable $sgpr64 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr4_sgpr5 = COPY renamable $sgpr34_sgpr35 - ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr52_sgpr53 + ; CHECK-NEXT: renamable $sgpr42_sgpr43 = COPY killed renamable $sgpr6_sgpr7 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY renamable $sgpr42_sgpr43 ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr10_sgpr11 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY renamable $sgpr38_sgpr39 - ; CHECK-NEXT: renamable $sgpr42_sgpr43 = COPY killed renamable $sgpr12_sgpr13 + ; CHECK-NEXT: renamable $sgpr44_sgpr45 = COPY killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: $sgpr12 = COPY renamable $sgpr33 ; CHECK-NEXT: $sgpr13 = COPY renamable $sgpr15 ; CHECK-NEXT: renamable $sgpr36 = COPY killed renamable $sgpr16 ; CHECK-NEXT: renamable $sgpr37 = COPY killed renamable $sgpr15 ; CHECK-NEXT: renamable $sgpr40 = COPY killed renamable $sgpr8 - ; CHECK-NEXT: renamable $sgpr44_sgpr45 = COPY killed renamable $sgpr18_sgpr19 - ; CHECK-NEXT: renamable $sgpr46_sgpr47 = COPY killed renamable $sgpr20_sgpr21 - ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr22_sgpr23 - ; CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr24_sgpr25 + ; CHECK-NEXT: renamable $sgpr46_sgpr47 = COPY killed renamable $sgpr18_sgpr19 + ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr20_sgpr21 + ; 
CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr22_sgpr23 + ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr24_sgpr25 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr64_sgpr65 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 - ; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr48_sgpr49 - ; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr46_sgpr47 - ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr44_sgpr45 - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr42_sgpr43 + ; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr52_sgpr53 + ; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr48_sgpr49 + ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr46_sgpr47 + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr44_sgpr45 + ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr42_sgpr43 ; CHECK-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr40 ; CHECK-NEXT: renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr38_sgpr39 ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr37 ; CHECK-NEXT: renamable $sgpr16 = COPY killed renamable $sgpr36 - ; CHECK-NEXT: renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr52_sgpr53 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $exec = S_MOV_B64_term renamable $sgpr92_sgpr93 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec Index: llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -0,0 +1,133 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 \ +# RUN: -start-before=greedy,0 -stop-after=virtregrewriter,1 -verify-regalloc -o - %s | FileCheck %s + +# This hit an assert when identifying snippet copy bundles from +# running off the end of the block due to using +# MachineBasicBlock::iterator instead of +# MachineBasicBlock::instr_iterator. + +--- | + + define amdgpu_kernel void @kernel() #0 { + bb: + ret void + } + + attributes #0 = { "amdgpu-calls" "amdgpu-waves-per-eu"="10,10" "target-cpu"="gfx900" } + +... 
+--- +name: kernel +tracksRegLiveness: true +frameInfo: + hasCalls: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + occupancy: 8 +body: | + ; CHECK-LABEL: name: kernel + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF + ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9 + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) + ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) + ; CHECK-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: $vcc = COPY renamable $sgpr40_sgpr41 + ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; CHECK-NEXT: dead renamable $vgpr0 = COPY killed renamable $sgpr49 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37 + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16 + + %0:sgpr_64 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + undef %2.sub1:sreg_64 = IMPLICIT_DEF + %3:sgpr_64 = COPY undef $sgpr8_sgpr9 + %4:sgpr_64 = IMPLICIT_DEF + %5:sgpr_256 = S_LOAD_DWORDX8_IMM %3, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %3, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %7:sgpr_256 = S_LOAD_DWORDX8_IMM %3, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) + %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %5.sub0_sub1, 0, 0 :: (invariant load (s64), align 16, addrspace 4) + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + $vgpr1 = COPY %5.sub7 + dead $sgpr30_sgpr31 = SI_CALL undef %8, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + $vcc = COPY %2 + S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + + bb.1: + S_BRANCH %bb.3 + + bb.2: + S_CMP_LG_U64 %7.sub0_sub1, 0, implicit-def $scc + + bb.3: + S_CBRANCH_VCCZ %bb.5, implicit undef $vcc + + bb.4: + S_CMP_EQ_U32 %7.sub4, 0, implicit-def $scc + + bb.5: + %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) + GLOBAL_STORE_DWORD_SADDR undef %1, undef %1, %7.sub2_sub3, 0, 0, implicit $exec :: (store (s32), addrspace 1) + GLOBAL_STORE_DWORD_SADDR undef %1, undef %1, %5.sub6_sub7, 0, 0, implicit $exec :: (store (s32), addrspace 1) + %10:vgpr_32 = COPY %5.sub5 + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + $sgpr6_sgpr7 = COPY %4 + $sgpr10_sgpr11 = COPY %0 + ADJCALLSTACKDOWN 0, 0, 
implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,14 +12,12 @@ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %22.sub0 - ; GCN-NEXT: undef %24.sub0:av_64 = COPY %22.sub0 - ; GCN-NEXT: SI_SPILL_AV64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %24.sub0 + ; GCN-NEXT: SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: undef %23.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0 - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64 */, %23 + ; GCN-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]] ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0 Index: llvm/test/CodeGen/AMDGPU/swdev380865.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -44,9 +44,9 @@ ; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], 0 ; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40140000 -; CHECK-NEXT: v_writelane_b32 v0, s0, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 +; CHECK-NEXT: v_writelane_b32 v0, s6, 9 +; CHECK-NEXT: v_writelane_b32 v0, s7, 10 +; CHECK-NEXT: v_writelane_b32 v0, s0, 11 ; CHECK-NEXT: v_readlane_b32 s6, v0, 1 ; CHECK-NEXT: v_readlane_b32 s7, v0, 2 ; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] @@ -54,8 +54,8 @@ ; CHECK-NEXT: s_mov_b32 s0, s2 ; CHECK-NEXT: v_writelane_b32 v0, s6, 1 ; CHECK-NEXT: v_writelane_b32 v0, s7, 2 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 9 +; CHECK-NEXT: v_readlane_b32 s7, v0, 10 ; CHECK-NEXT: s_mov_b32 s6, s2 ; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[0:1] ; CHECK-NEXT: v_readlane_b32 s0, v0, 3 @@ -88,14 +88,16 @@ ; CHECK-NEXT: s_mov_b32 s1, s3 ; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] ; CHECK-NEXT: v_writelane_b32 v0, s0, 7 -; CHECK-NEXT: s_mov_b32 s4, s0 ; CHECK-NEXT: v_writelane_b32 v0, s1, 8 +; CHECK-NEXT: 
s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s4, s0 ; CHECK-NEXT: v_readlane_b32 s0, v0, 0 -; CHECK-NEXT: v_readlane_b32 s2, v0, 9 -; CHECK-NEXT: s_add_i32 s2, s2, s0 -; CHECK-NEXT: v_writelane_b32 v0, s2, 9 +; CHECK-NEXT: v_readlane_b32 s2, v0, 11 ; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[4:5] -; CHECK-NEXT: v_readlane_b32 s0, v0, 9 +; CHECK-NEXT: s_add_i32 s2, s2, s0 +; CHECK-NEXT: v_writelane_b32 v0, s2, 11 +; CHECK-NEXT: v_readlane_b32 s0, v0, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit Index: llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll =================================================================== --- llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -35,21 +35,20 @@ ; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vmv.v.v v8, v10 -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -264,6 +264,8 @@ ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV32-V128-NEXT: li a1, 32 @@ -281,8 +283,6 @@ ; RV32-V128-NEXT: addi a0, a0, -1366 ; RV32-V128-NEXT: vmv.s.x v0, a0 ; RV32-V128-NEXT: vrgather.vv v16, v8, v24 -; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 @@ -323,6 +323,8 @@ ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0) ; RV64-V128-NEXT: li a1, 32 @@ -340,8 +342,6 @@ ; RV64-V128-NEXT: addiw a0, a0, -1366 ; RV64-V128-NEXT: vmv.s.x v0, a0 ; RV64-V128-NEXT: vrgather.vv v16, v8, v24 -; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli 
a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -420,6 +420,8 @@ ; RV32-V128-NEXT: add a0, sp, a0 ; RV32-V128-NEXT: addi a0, a0, 16 ; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-V128-NEXT: addi a0, sp, 16 +; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: lui a0, %hi(.LCPI17_0) ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) ; RV32-V128-NEXT: li a1, 32 @@ -437,8 +439,6 @@ ; RV32-V128-NEXT: addi a0, a0, -1366 ; RV32-V128-NEXT: vmv.s.x v0, a0 ; RV32-V128-NEXT: vrgather.vv v16, v8, v24 -; RV32-V128-NEXT: addi a0, sp, 16 -; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-V128-NEXT: csrr a0, vlenb ; RV32-V128-NEXT: slli a0, a0, 4 ; RV32-V128-NEXT: add a0, sp, a0 @@ -479,6 +479,8 @@ ; RV64-V128-NEXT: add a0, sp, a0 ; RV64-V128-NEXT: addi a0, a0, 16 ; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-V128-NEXT: addi a0, sp, 16 +; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: lui a0, %hi(.LCPI17_0) ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI17_0) ; RV64-V128-NEXT: li a1, 32 @@ -496,8 +498,6 @@ ; RV64-V128-NEXT: addiw a0, a0, -1366 ; RV64-V128-NEXT: vmv.s.x v0, a0 ; RV64-V128-NEXT: vrgather.vv v16, v8, v24 -; RV64-V128-NEXT: addi a0, sp, 16 -; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-V128-NEXT: csrr a0, vlenb ; RV64-V128-NEXT: slli a0, a0, 4 ; RV64-V128-NEXT: add a0, sp, a0 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -139,30 +139,36 @@ ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v24, (a3) +; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 57 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vid.v v16 +; RV32-NEXT: vid.v v24 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 37 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs4r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vadd.vi v8, v16, -4 +; RV32-NEXT: vs4r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vi v8, v24, -4 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a4, a3, 4 ; RV32-NEXT: add a3, a4, a3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vrgather.vv v12, v24, v8 +; RV32-NEXT: vrgather.vv v12, v16, v8 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 45 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vadd.vi v8, v16, -10 +; RV32-NEXT: vadd.vi v8, v24, -10 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: csrr a3, vlenb @@ -172,13 +178,13 @@ ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; 
RV32-NEXT: vslidedown.vi v16, v24, 16 +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 57 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a4, a3, 6 +; RV32-NEXT: add a3, a4, a3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgather.vv v12, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -187,12 +193,6 @@ ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a4, a3, 6 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) @@ -687,22 +687,22 @@ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 90 +; RV64-NEXT: li a3, 86 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xda, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 90 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd6, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 86 * vlenb ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 6 -; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: li a3, 61 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vle64.v v0, (a1) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 73 +; RV64-NEXT: li a3, 69 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -712,7 +712,7 @@ ; RV64-NEXT: vmul.vx v8, v8, a2 ; RV64-NEXT: vrgather.vv v24, v0, v8 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 81 +; RV64-NEXT: li a3, 77 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -720,55 +720,43 @@ ; RV64-NEXT: li a2, 56 ; RV64-NEXT: vmv.s.x v0, a2 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 45 +; RV64-NEXT: li a3, 41 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vadd.vi v8, v8, -16 ; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 5 -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a1, 256 ; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: li a1, 128 ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v8, v16, 4 ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v16, 8 +; 
RV64-NEXT: vslidedown.vi v16, v16, 8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v24, 2, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v8, v16, 2, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v8, v16 +; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 @@ -777,14 +765,14 @@ ; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vi v8, v16, 1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 73 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -792,34 +780,34 @@ ; RV64-NEXT: vrgather.vv v24, v0, v8 ; RV64-NEXT: vadd.vi v8, v16, -15 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 41 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 61 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vi v8, v16, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -841,18 +829,18 @@ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v20, zero +; RV64-NEXT: vmv.s.x v12, zero ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vslideup.vi v8, v20, 5 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 13 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslideup.vi v8, v12, 5 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: 
add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -860,21 +848,21 @@ ; RV64-NEXT: vrgather.vv v12, v24, v8 ; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 41 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vi v0, v8, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 73 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -886,22 +874,22 @@ ; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vadd.vi v8, v8, -14 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 61 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 41 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 41 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -917,50 +905,50 @@ ; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vmv.s.x v12, a1 ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vslideup.vi v8, v12, 5 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a2, a1, 3 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslideup.vi v8, v12, 5 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v16, v24, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vi v16, v8, 5, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vi v16, v0, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 73 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 
; RV64-NEXT: addi a1, a1, 16 @@ -974,8 +962,8 @@ ; RV64-NEXT: addi a1, sp, 16 ; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 61 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -986,14 +974,14 @@ ; RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vmv.v.v v8, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1020,14 +1008,14 @@ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vi v20, v8, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1041,14 +1029,14 @@ ; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vi v16, v0, 4 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 73 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1066,8 +1054,8 @@ ; RV64-NEXT: vmv1r.v v0, v16 ; RV64-NEXT: vmv1r.v v1, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 61 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -1108,7 +1096,7 @@ ; RV64-NEXT: vslideup.vi v16, v8, 6 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1119,7 +1107,7 @@ ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1127,21 +1115,21 @@ ; RV64-NEXT: vrgather.vv v4, v8, v16, v0.t ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vadd.vi v16, v8, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 73 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgather.vv v8, v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 81 +; RV64-NEXT: li a2, 77 ; 
RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1149,8 +1137,8 @@ ; RV64-NEXT: vadd.vi v16, v16, -11 ; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 61 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -1170,7 +1158,7 @@ ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 41 +; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -1178,7 +1166,7 @@ ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 128 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 45 +; RV64-NEXT: li a3, 41 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -1200,7 +1188,7 @@ ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 90 +; RV64-NEXT: li a1, 86 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -1563,11 +1563,11 @@ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vslidedown.vi v0, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv4r.v v8, v0 @@ -1611,11 +1611,11 @@ ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vslidedown.vi v0, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vmv4r.v v8, v0 @@ -1662,11 +1662,11 @@ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vslidedown.vi v0, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv4r.v v8, v0 @@ -1710,11 +1710,11 @@ ; RV64-NEXT: li a2, 32 ; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vle32.v v16, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV64-NEXT: vslidedown.vi v0, v16, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; 
RV64-NEXT: vmv4r.v v8, v0 Index: llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll +++ llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll @@ -210,62 +210,62 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldr s23, [sp, #124] +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: vldr s23, [sp, #140] ; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vldr s22, [sp, #116] +; CHECK-NEXT: vldr s22, [sp, #132] ; CHECK-NEXT: vmov.f32 s25, s11 ; CHECK-NEXT: vmov.f32 s13, s10 -; CHECK-NEXT: vldr s19, [sp, #120] +; CHECK-NEXT: vldr s19, [sp, #136] ; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vldr s18, [sp, #112] +; CHECK-NEXT: vldr s18, [sp, #128] ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vldr s31, [sp, #172] +; CHECK-NEXT: vldr s31, [sp, #188] ; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vldr s30, [sp, #164] +; CHECK-NEXT: vldr s30, [sp, #180] ; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vldr s29, [sp, #156] +; CHECK-NEXT: vldr s29, [sp, #172] ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vldr s28, [sp, #148] +; CHECK-NEXT: vldr s28, [sp, #164] ; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov.f32 s24, s9 ; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vldr s27, [sp, #168] +; CHECK-NEXT: vldr s27, [sp, #184] ; CHECK-NEXT: vmov.f32 s17, s14 -; CHECK-NEXT: vldr s26, [sp, #160] +; CHECK-NEXT: vldr s26, [sp, #176] ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vldr s25, [sp, #152] +; CHECK-NEXT: vldr s25, [sp, #168] ; CHECK-NEXT: vmov.f32 s8, s0 ; CHECK-NEXT: vmul.f32 q0, q5, q1 ; CHECK-NEXT: vmul.f32 q1, q4, q1 ; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vldr s24, [sp, #144] +; CHECK-NEXT: vldr s24, [sp, #160] ; CHECK-NEXT: vfma.f32 q1, q5, q2 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vsub.f32 q6, q6, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vldr s13, [sp, #140] +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldr s13, [sp, #156] ; CHECK-NEXT: vfma.f32 q1, q4, q2 -; CHECK-NEXT: vldr s12, [sp, #132] +; CHECK-NEXT: vldr s12, [sp, #148] ; CHECK-NEXT: vadd.f32 q1, q7, q1 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldr s1, [sp, #136] -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldr s1, [sp, #152] +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: vmul.f32 q2, q3, q7 -; CHECK-NEXT: vldr s0, [sp, #128] -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldr s0, [sp, #144] +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vneg.f32 q2, q2 -; CHECK-NEXT: vldr s21, [sp, #184] +; CHECK-NEXT: vldr s21, [sp, #200] ; CHECK-NEXT: vfma.f32 q2, q0, q3 ; CHECK-NEXT: vmul.f32 q0, q0, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldr s20, [sp, #176] -; CHECK-NEXT: vldr s17, [sp, #188] -; CHECK-NEXT: vldr s16, [sp, #180] 
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vldr s20, [sp, #192] +; CHECK-NEXT: vldr s17, [sp, #204] +; CHECK-NEXT: vldr s16, [sp, #196] ; CHECK-NEXT: vfma.f32 q0, q7, q3 ; CHECK-NEXT: vsub.f32 q3, q5, q0 ; CHECK-NEXT: vmov.f32 s1, s4 @@ -280,7 +280,7 @@ ; CHECK-NEXT: vmov.f32 s9, s16 ; CHECK-NEXT: vmov.f32 s10, s13 ; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -197,46 +197,45 @@ ; CHECK-NEXT: vmov.f32 s19, s10 ; CHECK-NEXT: vmov.f32 s13, s8 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vmov.f32 s18, s6 ; CHECK-NEXT: vmov.f64 d14, d4 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s5, s27 +; CHECK-NEXT: vmov.f32 s8, s24 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s24, s1 +; CHECK-NEXT: vmov.f32 s27, s2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov r0, r3, d14 ; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s8, s24 -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vstrw.32 q0, [r1, #128] ; CHECK-NEXT: vmov.f32 s11, s25 +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: vmov.f32 s12, s4 +; CHECK-NEXT: vstrw.32 q6, [r1, #64] ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.32 q2[2], r0 ; CHECK-NEXT: vmov r0, lr, d14 ; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.f64 d12, d14 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vstrw.32 q5, [r1, #144] -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s24, s1 -; CHECK-NEXT: vstrw.32 q1, [r1, #80] -; CHECK-NEXT: vmov.f32 s27, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov r2, r4, d14 -; CHECK-NEXT: vmov.32 q6[1], r3 -; CHECK-NEXT: vstrw.32 q0, [r1, #128] -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[1], r4 ; CHECK-NEXT: vmov.32 q0[1], lr -; CHECK-NEXT: vstrw.32 q6, [r1, #64] +; CHECK-NEXT: vmov.32 q5[2], r0 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] ; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #96] -; CHECK-NEXT: vstrw.32 q4, [r1, #112] +; CHECK-NEXT: vmov r2, r4, d14 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1, #176] ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.32 q4[1], r4 ; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vstrw.32 q1, [r1, #80] +; CHECK-NEXT: vstrw.32 q3, [r1, #96] +; CHECK-NEXT: vstrw.32 q4, [r1, #112] +; CHECK-NEXT: vstrw.32 q5, [r1, #144] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1017,14 +1016,13 @@ ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vmov.f32 s0, s2 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: 
vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.f32 s0, s12 ; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmov.f32 s3, s13