Index: llvm/include/llvm/CodeGen/MachineInstrBundle.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineInstrBundle.h
+++ llvm/include/llvm/CodeGen/MachineInstrBundle.h
@@ -241,6 +241,13 @@
     MachineInstr &MI, Register Reg,
     SmallVectorImpl<std::pair<MachineInstr *, unsigned>> *Ops = nullptr);
 
+/// Return a pair of lane masks (reads, writes) indicating which lanes this
+/// instruction uses with Reg.
+std::pair<LaneBitmask, LaneBitmask>
+AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg,
+                            const MachineRegisterInfo &MRI,
+                            const TargetRegisterInfo &TRI);
+
 /// Information about how a physical register Reg is used by a set of
 /// operands.
 struct PhysRegInfo {
Index: llvm/lib/CodeGen/InlineSpiller.cpp
===================================================================
--- llvm/lib/CodeGen/InlineSpiller.cpp
+++ llvm/lib/CodeGen/InlineSpiller.cpp
@@ -175,6 +175,7 @@
 
   // All COPY instructions to/from snippets.
   // They are ignored since both operands refer to the same stack slot.
+  // For bundled copies, this will only include the first header copy.
   SmallPtrSet<MachineInstr *, 8> SnippetCopies;
 
   // Values that failed to remat at some point.
@@ -255,15 +256,58 @@
 // This minimizes register pressure and maximizes the store-to-load distance for
 // spill slots which can be important in tight loops.
 
-/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
-/// otherwise return 0.
-static Register isFullCopyOf(const MachineInstr &MI, Register Reg) {
-  if (!MI.isFullCopy())
+/// If MI is a COPY to or from Reg, return the other register, otherwise return
+/// 0.
+static Register isCopyOf(const MachineInstr &MI, Register Reg) {
+  assert(!MI.isBundled());
+  if (!MI.isCopy())
     return Register();
-  if (MI.getOperand(0).getReg() == Reg)
-    return MI.getOperand(1).getReg();
-  if (MI.getOperand(1).getReg() == Reg)
-    return MI.getOperand(0).getReg();
+
+  const MachineOperand &DstOp = MI.getOperand(0);
+  const MachineOperand &SrcOp = MI.getOperand(1);
+
+  // TODO: Probably only worth allowing subreg copies with undef dests.
+  if (DstOp.getSubReg() != SrcOp.getSubReg())
+    return Register();
+  if (DstOp.getReg() == Reg)
+    return SrcOp.getReg();
+  if (SrcOp.getReg() == Reg)
+    return DstOp.getReg();
+  return Register();
+}
+
+/// Check for a copy bundle as formed by SplitKit.
+static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
+  if (!FirstMI.isBundled())
+    return isCopyOf(FirstMI, Reg);
+
+  assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() &&
+         "expected to see first instruction in bundle");
+
+  Register SnipReg;
+  MachineBasicBlock::const_iterator I = FirstMI.getIterator();
+  while (I->isBundledWithSucc()) {
+    const MachineInstr &MI = *I;
+    if (!MI.isCopy())
+      return Register();
+
+    const MachineOperand &DstOp = MI.getOperand(0);
+    const MachineOperand &SrcOp = MI.getOperand(1);
+    if (DstOp.getReg() == Reg) {
+      if (!SnipReg)
+        SnipReg = SrcOp.getReg();
+      else if (SnipReg != SrcOp.getReg())
+        return Register();
+    } else if (SrcOp.getReg() == Reg) {
+      if (!SnipReg)
+        SnipReg = DstOp.getReg();
+      else if (SnipReg != DstOp.getReg())
+        return Register();
+    }
+
+    ++I;
+  }
+
   return Register();
 }
 
@@ -307,14 +351,14 @@
   MachineInstr *UseMI = nullptr;
 
   // Check that all uses satisfy our criteria.
-  for (MachineRegisterInfo::reg_instr_nodbg_iterator
-           RI = MRI.reg_instr_nodbg_begin(SnipLI.reg()),
-           E = MRI.reg_instr_nodbg_end();
+  for (MachineRegisterInfo::reg_bundle_nodbg_iterator
+           RI = MRI.reg_bundle_nodbg_begin(SnipLI.reg()),
+           E = MRI.reg_bundle_nodbg_end();
        RI != E;) {
     MachineInstr &MI = *RI++;
 
     // Allow copies to/from Reg.
-    if (isFullCopyOf(MI, Reg))
+    if (isCopyOfBundle(MI, Reg))
       continue;
 
     // Allow stack slot loads.
@@ -351,9 +395,8 @@
   if (Original == Reg)
     return;
 
-  for (MachineInstr &MI :
-       llvm::make_early_inc_range(MRI.reg_instructions(Reg))) {
-    Register SnipReg = isFullCopyOf(MI, Reg);
+  for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
+    Register SnipReg = isCopyOfBundle(MI, Reg);
     if (!isSibling(SnipReg))
       continue;
     LiveInterval &SnipLI = LIS.getInterval(SnipReg);
@@ -475,7 +518,7 @@
 
     // Find all spills and copies of VNI.
     for (MachineInstr &MI :
-         llvm::make_early_inc_range(MRI.use_nodbg_instructions(Reg))) {
+         llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) {
       if (!MI.isCopy() && !MI.mayStore())
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
@@ -483,13 +526,14 @@
         continue;
 
       // Follow sibling copies down the dominator tree.
-      if (Register DstReg = isFullCopyOf(MI, Reg)) {
+      if (Register DstReg = isCopyOfBundle(MI, Reg)) {
         if (isSibling(DstReg)) {
-          LiveInterval &DstLI = LIS.getInterval(DstReg);
-          VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
-          assert(DstVNI && "Missing defined value");
-          assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot");
-          WorkList.push_back(std::make_pair(&DstLI, DstVNI));
+          LiveInterval &DstLI = LIS.getInterval(DstReg);
+          VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
+          assert(DstVNI && "Missing defined value");
+          assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot");
+
+          WorkList.push_back(std::make_pair(&DstLI, DstVNI));
         }
         continue;
       }
@@ -1111,7 +1155,7 @@
       Idx = VNI->def;
 
     // Check for a sibling copy.
-    Register SibReg = isFullCopyOf(MI, Reg);
+    Register SibReg = isCopyOfBundle(MI, Reg);
     if (SibReg && isSibling(SibReg)) {
       // This may actually be a copy between snippets.
       if (isRegToSpill(SibReg)) {
@@ -1202,8 +1246,8 @@
          llvm::make_early_inc_range(MRI.reg_instructions(Reg))) {
       assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy");
       // FIXME: Do this with a LiveRangeEdit callback.
-      LIS.RemoveMachineInstrFromMaps(MI);
-      MI.eraseFromParent();
+      LIS.getSlotIndexes()->removeSingleMachineInstrFromMaps(MI);
+      MI.eraseFromBundle();
     }
   }
 
Index: llvm/lib/CodeGen/LiveRangeEdit.cpp
===================================================================
--- llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -286,8 +286,12 @@
 
   // Never delete a bundled instruction.
   if (MI->isBundled()) {
+    // TODO: Handle deleting copy bundles
+    LLVM_DEBUG(dbgs() << "Won't delete dead bundled inst: " << Idx << '\t'
+                      << *MI);
     return;
   }
+
   // Never delete inline asm.
   if (MI->isInlineAsm()) {
     LLVM_DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
Index: llvm/lib/CodeGen/MachineInstrBundle.cpp
===================================================================
--- llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -310,6 +310,34 @@
   return RI;
 }
 
+std::pair<LaneBitmask, LaneBitmask>
+llvm::AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg,
+                                  const MachineRegisterInfo &MRI,
+                                  const TargetRegisterInfo &TRI) {
+
+  LaneBitmask UseMask, DefMask;
+
+  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+    const MachineOperand &MO = *O;
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg == 0 && MO.isUse() && !MO.isUndef())
+      UseMask |= MRI.getMaxLaneMaskForVReg(Reg);
+
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+    if (MO.isDef()) {
+      if (!MO.isUndef())
+        UseMask |= ~SubRegMask;
+      DefMask |= SubRegMask;
+    } else if (!MO.isUndef())
+      UseMask |= SubRegMask;
+  }
+
+  return {UseMask, DefMask};
+}
+
 PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg,
                                          const TargetRegisterInfo *TRI) {
   bool AllDefsDead = true;
Index: llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
+++ llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
@@ -59,11 +59,9 @@
     ; CHECK-LABEL: name: split_instruction_subranges_use_is_subreg_def
    ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: undef %16.sub0:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR1]].sub0
-    ; CHECK-NEXT: SI_SPILL_V64_SAVE %16, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
-    ; CHECK-NEXT: undef %10.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
-    ; CHECK-NEXT: SI_SPILL_V64_SAVE %10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
     ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
     ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
     ; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
Index: llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,14 +12,12 @@
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %22.sub0
-  ; GCN-NEXT:   undef %24.sub0:av_64 = COPY %22.sub0
-  ; GCN-NEXT:   SI_SPILL_AV64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1769482 /* regdef:VGPR_32 */, def undef %24.sub0
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
   ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
-  ; GCN-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; GCN-NEXT:   undef %23.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0
-  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64 */, %23
+  ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
   ; GCN-NEXT:   S_ENDPGM 0
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
Index: llvm/test/CodeGen/AMDGPU/swdev380865.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -44,9 +44,9 @@
 ; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], 0
 ; CHECK-NEXT:    s_mov_b32 s2, 0
 ; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 11
 ; CHECK-NEXT:    v_readlane_b32 s6, v0, 1
 ; CHECK-NEXT:    v_readlane_b32 s7, v0, 2
 ; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
@@ -54,8 +54,8 @@
 ; CHECK-NEXT:    s_mov_b32 s0, s2
 ; CHECK-NEXT:    v_writelane_b32 v0, s6, 1
 ; CHECK-NEXT:    v_writelane_b32 v0, s7, 2
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 10
 ; CHECK-NEXT:    s_mov_b32 s6, s2
 ; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s0, v0, 3
@@ -88,14 +88,16 @@
 ; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[2:3]
 ; CHECK-NEXT:    v_writelane_b32 v0, s0, 7
-; CHECK-NEXT:    s_mov_b32 s4, s0
 ; CHECK-NEXT:    v_writelane_b32 v0, s1, 8
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s4, s0
 ; CHECK-NEXT:    v_readlane_b32 s0, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 9
-; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 9
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 11
 ; CHECK-NEXT:    v_add_f64 v[1:2], v[1:2], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 9
+; CHECK-NEXT:    s_add_i32 s2, s2, s0
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 11
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 11
 ; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -264,9 +264,9 @@
 ; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-V128-NEXT:    vle32.v v0, (a0)
 ; RV32-V128-NEXT:    vmv8r.v v24, v8
-; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    addi a0, sp, 16
-; RV32-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI10_1)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
 ; RV32-V128-NEXT:    vle32.v v24, (a0)
@@ -315,9 +315,9 @@
 ; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-V128-NEXT:    vle32.v v0, (a0)
 ; RV64-V128-NEXT:    vmv8r.v v24, v8
-; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    addi a0, sp, 16
-; RV64-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI10_1)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI10_1)
 ; RV64-V128-NEXT:    vle32.v v24, (a0)
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -368,9 +368,9 @@
 ; RV32-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-V128-NEXT:    vle32.v v0, (a0)
 ; RV32-V128-NEXT:    vmv8r.v v24, v8
-; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    addi a0, sp, 16
-; RV32-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV32-V128-NEXT:    lui a0, %hi(.LCPI15_1)
 ; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI15_1)
 ; RV32-V128-NEXT:    vle32.v v24, (a0)
@@ -419,9 +419,9 @@
 ; RV64-V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-V128-NEXT:    vle32.v v0, (a0)
 ; RV64-V128-NEXT:    vmv8r.v v24, v8
-; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    addi a0, sp, 16
-; RV64-V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT:    vrgather.vv v8, v24, v0
 ; RV64-V128-NEXT:    lui a0, %hi(.LCPI15_1)
 ; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI15_1)
 ; RV64-V128-NEXT:    vle32.v v24, (a0)
Index: llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -1563,11 +1563,11 @@
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v8, 16
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -1632,11 +1632,11 @@
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a1, 24
 ; RV64-NEXT:    mul a0, a0, a1
@@ -1704,11 +1704,11 @@
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v16, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v8, 16
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -1773,11 +1773,11 @@
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a1, 24
 ; RV64-NEXT:    mul a0, a0, a1
Index: llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
+++ llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
@@ -221,62 +221,62 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #48
-; CHECK-NEXT:    sub sp, #48
-; CHECK-NEXT:    vldr s23, [sp, #124]
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    vldr s23, [sp, #140]
 ; CHECK-NEXT:    vmov.f32 s20, s13
-; CHECK-NEXT:    vldr s22, [sp, #116]
+; CHECK-NEXT:    vldr s22, [sp, #132]
 ; CHECK-NEXT:    vmov.f32 s25, s11
 ; CHECK-NEXT:    vmov.f32 s13, s10
-; CHECK-NEXT:    vldr s19, [sp, #120]
+; CHECK-NEXT:    vldr s19, [sp, #136]
 ; CHECK-NEXT:    vmov.f32 s11, s6
-; CHECK-NEXT:    vldr s18, [sp, #112]
+; CHECK-NEXT:    vldr s18, [sp, #128]
 ; CHECK-NEXT:    vmov.f32 s6, s5
-; CHECK-NEXT:    vldr s31, [sp, #172]
+; CHECK-NEXT:    vldr s31, [sp, #188]
 ; CHECK-NEXT:    vmov.f32 s10, s4
-; CHECK-NEXT:    vldr s30, [sp, #164]
+; CHECK-NEXT:    vldr s30, [sp, #180]
 ; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vldr s29, [sp, #156]
+; CHECK-NEXT:    vldr s29, [sp, #172]
 ; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vldr s28, [sp, #148]
+; CHECK-NEXT:    vldr s28, [sp, #164]
 ; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vmov.f32 s24, s9
 ; CHECK-NEXT:    vmov.f32 s16, s12
-; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q6, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s12, s8
-; CHECK-NEXT:    vldr s27, [sp, #168]
+; CHECK-NEXT:    vldr s27, [sp, #184]
 ; CHECK-NEXT:    vmov.f32 s17, s14
-; CHECK-NEXT:    vldr s26, [sp, #160]
+; CHECK-NEXT:    vldr s26, [sp, #176]
 ; CHECK-NEXT:    vmov.f32 s9, s2
-; CHECK-NEXT:    vldr s25, [sp, #152]
+; CHECK-NEXT:    vldr s25, [sp, #168]
 ; CHECK-NEXT:    vmov.f32 s8, s0
 ; CHECK-NEXT:    vmul.f32 q0, q5, q1
 ; CHECK-NEXT:    vmul.f32 q1, q4, q1
 ; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    vldr s24, [sp, #144]
+; CHECK-NEXT:    vldr s24, [sp, #160]
 ; CHECK-NEXT:    vfma.f32 q1, q5, q2
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vsub.f32 q6, q6, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vldr s13, [sp, #140]
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldr s13, [sp, #156]
 ; CHECK-NEXT:    vfma.f32 q1, q4, q2
-; CHECK-NEXT:    vldr s12, [sp, #132]
+; CHECK-NEXT:    vldr s12, [sp, #148]
 ; CHECK-NEXT:    vadd.f32 q1, q7, q1
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldr s1, [sp, #136]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldr s1, [sp, #152]
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmul.f32 q2, q3, q7
-; CHECK-NEXT:    vldr s0, [sp, #128]
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldr s0, [sp, #144]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vneg.f32 q2, q2
-; CHECK-NEXT:    vldr s21, [sp, #184]
+; CHECK-NEXT:    vldr s21, [sp, #200]
 ; CHECK-NEXT:    vfma.f32 q2, q0, q3
 ; CHECK-NEXT:    vmul.f32 q0, q0, q7
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldr s20, [sp, #176]
-; CHECK-NEXT:    vldr s17, [sp, #188]
-; CHECK-NEXT:    vldr s16, [sp, #180]
+; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vldr s20, [sp, #192]
+; CHECK-NEXT:    vldr s17, [sp, #204]
+; CHECK-NEXT:    vldr s16, [sp, #196]
 ; CHECK-NEXT:    vfma.f32 q0, q7, q3
 ; CHECK-NEXT:    vsub.f32 q3, q5, q0
 ; CHECK-NEXT:    vmov.f32 s1, s4
@@ -291,7 +291,7 @@
 ; CHECK-NEXT:    vmov.f32 s9, s16
 ; CHECK-NEXT:    vmov.f32 s10, s13
 ; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1014,14 +1014,13 @@
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vmov.f32 s0, s2
 ; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov.f32 s1, s15
+; CHECK-NEXT:    vmov.f32 s2, s11
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s0, s12
 ; CHECK-NEXT:    vmov.f32 s1, s8
 ; CHECK-NEXT:    vmov.f32 s3, s13