diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1241,7 +1241,9 @@
       llvm_unreachable("Unknown reg class!");
     break;
   case 32:
-    if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+    if (ARM::QQPRRegClass.hasSubClassEq(RC) ||
+        ARM::MQQPRRegClass.hasSubClassEq(RC) ||
+        ARM::DQuadRegClass.hasSubClassEq(RC)) {
       if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
           Subtarget.hasNEON()) {
         // FIXME: It's possible to only store part of the QQ register if the
@@ -1267,7 +1269,8 @@
       llvm_unreachable("Unknown reg class!");
     break;
   case 64:
-    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC) ||
+        ARM::MQQQQPRRegClass.hasSubClassEq(RC)) {
       MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA))
                                     .addFrameIndex(FI)
                                     .add(predOps(ARMCC::AL))
@@ -1473,31 +1476,34 @@
       llvm_unreachable("Unknown reg class!");
     break;
   case 32:
-    if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
-      if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
-          Subtarget.hasNEON()) {
-        BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
-            .addFrameIndex(FI)
-            .addImm(16)
-            .addMemOperand(MMO)
-            .add(predOps(ARMCC::AL));
-      } else {
-        MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
-                                      .addFrameIndex(FI)
-                                      .add(predOps(ARMCC::AL))
-                                      .addMemOperand(MMO);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
-        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
-        if (Register::isPhysicalRegister(DestReg))
-          MIB.addReg(DestReg, RegState::ImplicitDefine);
-      }
-    } else
-      llvm_unreachable("Unknown reg class!");
-    break;
+    if (ARM::QQPRRegClass.hasSubClassEq(RC) ||
+        ARM::MQQPRRegClass.hasSubClassEq(RC) ||
+        ARM::DQuadRegClass.hasSubClassEq(RC)) {
+      if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
+          Subtarget.hasNEON()) {
+        BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+            .addFrameIndex(FI)
+            .addImm(16)
+            .addMemOperand(MMO)
+            .add(predOps(ARMCC::AL));
+      } else {
+        MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                                      .addFrameIndex(FI)
+                                      .add(predOps(ARMCC::AL))
+                                      .addMemOperand(MMO);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+        if (Register::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
   case 64:
-    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC) ||
+        ARM::MQQQQPRRegClass.hasSubClassEq(RC)) {
       MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
                                     .addFrameIndex(FI)
                                     .add(predOps(ARMCC::AL))
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -263,6 +263,13 @@
     case ARM::QQQQPRRegClassID:
       if (MF.getSubtarget<ARMSubtarget>().hasNEON())
        return Super;
+      break;
+    case ARM::MQPRRegClassID:
+    case ARM::MQQPRRegClassID:
+    case ARM::MQQQQPRRegClassID:
+      if (MF.getSubtarget<ARMSubtarget>().hasMVEIntegerOps())
+        return Super;
+      break;
     }
     Super = *I++;
   } while (Super);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1853,12 +1853,18 @@
   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
   // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
   // MVE Q registers.
-  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
+  if (Subtarget->hasNEON()) {
     if (VT == MVT::v4i64)
       return &ARM::QQPRRegClass;
     if (VT == MVT::v8i64)
       return &ARM::QQQQPRRegClass;
   }
+  if (Subtarget->hasMVEIntegerOps()) {
+    if (VT == MVT::v4i64)
+      return &ARM::MQQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::MQQQQPRRegClass;
+  }
   return TargetLowering::getRegClassFor(VT);
 }
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -97,7 +97,7 @@
                          "q-registers in range [q0,q7]";
 }
 
-def VecList2Q : RegisterOperand<QQPR> {
+def VecList2Q : RegisterOperand<MQQPR> {
   let ParserMatchClass = VecList2QAsmOperand;
   let PrintMethod = "printMVEVectorList<2>";
 }
@@ -110,7 +110,7 @@
                          "q-registers in range [q0,q7]";
 }
 
-def VecList4Q : RegisterOperand<QQQQPR> {
+def VecList4Q : RegisterOperand<MQQQQPR> {
   let ParserMatchClass = VecList4QAsmOperand;
   let PrintMethod = "printMVEVectorList<4>";
 }
@@ -6037,13 +6037,13 @@
     def : Pat<(int_arm_mve_vst2q i32:$addr,
                   (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
               (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
-                  (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+                  (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
                   t2_addr_offset_none:$addr)>;
   foreach stage = [0,1] in
     def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32),
                   (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))),
               (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb)
-                  (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+                  (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
                   t2_addr_offset_none:$addr))>;
 
   foreach stage = [0,1,2,3] in
@@ -6051,16 +6051,16 @@
                   (VT MQPR:$v0), (VT MQPR:$v1),
                   (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
               (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
-                  (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
-                                        VT:$v2, qsub_2, VT:$v3, qsub_3),
+                  (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+                                         VT:$v2, qsub_2, VT:$v3, qsub_3),
                   t2_addr_offset_none:$addr)>;
   foreach stage = [0,1,2,3] in
     def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64),
                   (VT MQPR:$v0), (VT MQPR:$v1),
                   (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))),
               (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb)
-                  (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
-                                        VT:$v2, qsub_2, VT:$v3, qsub_3),
+                  (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+                                         VT:$v2, qsub_2, VT:$v3, qsub_3),
                   t2_addr_offset_none:$addr))>;
 }
 defm : MVE_vst24_patterns<8, v16i8>;
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -557,6 +557,9 @@
   let AltOrderSelect = [{ return 1; }];
 }
 
+// Same as QQPR but for MVE, containing the 7 register pairs made up from Q0-Q7.
+def MQQPR : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 7)>;
+
 // Tuples of 4 D regs that isn't also a pair of Q regs.
 def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
                                 [(decimate (shl DPR, 1), 2),
@@ -580,6 +583,9 @@
   let AltOrderSelect = [{ return 1; }];
 }
 
+// Same as QQQQPR but for MVE, containing the 5 register quads made up from Q0-Q7.
+def MQQQQPR : RegisterClass<"ARM", [v8i64], 256, (trunc QQQQPR, 5)>;
+
 // Pseudo-registers representing 2-spaced consecutive D registers.
 def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -3343,16 +3343,16 @@
     // regs) or q0-q4 (for 4)
     //
    // The MVE instructions taking a register range of this kind will
-    // need an operand in the QQPR or QQQQPR class, representing the
+    // need an operand in the MQQPR or MQQQQPR class, representing the
     // entire range as a unit. So we must translate into that class,
     // by finding the index of the base register in the MQPR reg
     // class, and returning the super-register at the corresponding
     // index in the target class.
     const MCRegisterClass *RC_in = &ARMMCRegisterClasses[ARM::MQPRRegClassID];
-    const MCRegisterClass *RC_out = (VectorList.Count == 2) ?
-      &ARMMCRegisterClasses[ARM::QQPRRegClassID] :
-      &ARMMCRegisterClasses[ARM::QQQQPRRegClassID];
+    const MCRegisterClass *RC_out =
+        (VectorList.Count == 2) ? &ARMMCRegisterClasses[ARM::MQQPRRegClassID]
+                                : &ARMMCRegisterClasses[ARM::MQQQQPRRegClassID];
 
     unsigned I, E = RC_out->getNumRegs();
     for (I = 0; I < E; I++)
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -227,10 +227,12 @@
                                             uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                             uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
                                              uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
@@ -6154,9 +6156,9 @@
   ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7
 };
 
-static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const void *Decoder) {
+static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
   if (RegNo > 6)
     return MCDisassembler::Fail;
 
@@ -6170,9 +6172,9 @@
   ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7
 };
 
-static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Address,
-                                              const void *Decoder) {
+static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
   if (RegNo > 4)
     return MCDisassembler::Fail;
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -77,8 +77,9 @@
 ; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
-; CHECK-NEXT:    vadd.i32 q3, q3, q4
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4
 ; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
+; CHECK-NEXT:    vadd.i32 q3, q3, q4
 ; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
 ; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
@@ -479,8 +480,9 @@
 ; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
-; CHECK-NEXT:    vadd.f32 q3, q3, q4
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4
 ; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
+; CHECK-NEXT:    vadd.f32 q3, q3, q4
 ; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
 ; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -73,27 +73,27 @@
 define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
 ; CHECK-LABEL: vld4_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i32 q6, q2, q3
+; CHECK-NEXT:    vadd.i32 q4, q2, q3
+; CHECK-NEXT:    vadd.i32 q5, q0, q1
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vadd.i32 q4, q5, q4
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.i32 q5, q2, q3
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q0, q6
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT:    vadd.i32 q5, q3, q4
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vadd.i32 q1, q1, q5
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i32>, <32 x i32>* %src, align 4
@@ -126,50 +126,46 @@
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
 ; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT:    vadd.i32 q4, q2, q3
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vadd.i32 q6, q0, q1
 ; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vadd.i32 q6, q5, q6
-; CHECK-NEXT:    vstrw.32 q6, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vadd.i32 q2, q6, q2
+; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q2, q3, q4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q1, q3, q5
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q0, q0, q5
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vadd.i32 q1, q3, q1
 ; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
-; CHECK-NEXT:    vadd.i32 q2, q3, q4
 ; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
 ; CHECK-NEXT:    vadd.i32 q1, q5, q6
+; CHECK-NEXT:    vadd.i32 q2, q3, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vadd.i32 q1, q2, q1
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #112
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
@@ -349,27 +345,27 @@
 define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.i16 q6, q2, q3
+; CHECK-NEXT:    vadd.i16 q4, q2, q3
+; CHECK-NEXT:    vadd.i16 q5, q0, q1
+; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.i16 q5, q2, q3
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vld40.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vadd.i16 q0, q0, q6
-; CHECK-NEXT:    vld41.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld42.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld43.16 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT:    vadd.i16 q5, q3, q4
-; CHECK-NEXT:    vadd.i16 q1, q1, q2
-; CHECK-NEXT:    vadd.i16 q1, q1, q5
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vadd.i16 q0, q0, q5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <64 x i16>, <64 x i16>* %src, align 2
@@ -871,27 +867,27 @@
 define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
 ; CHECK-LABEL: vld4_v8f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
 ; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vadd.f32 q6, q2, q3
+; CHECK-NEXT:    vadd.f32 q4, q2, q3
+; CHECK-NEXT:    vadd.f32 q5, q0, q1
+; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vadd.f32 q4, q5, q4
+; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
+; CHECK-NEXT:    vadd.f32 q5, q2, q3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vadd.f32 q0, q0, q6
-; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
-; CHECK-NEXT:    vadd.f32 q5, q3, q4
-; CHECK-NEXT:    vadd.f32 q1, q1, q2
-; CHECK-NEXT:    vadd.f32 q1, q1, q5
-; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x float>, <32 x float>* %src, align 4
@@ -924,50 +920,46 @@
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
 ; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
 ; CHECK-NEXT:    vadd.f32 q4, q2, q3
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q6, q0, q1
 ; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vadd.f32 q6, q5, q6
-; CHECK-NEXT:    vstrw.32 q6, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
-; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-NEXT:    vadd.f32 q2, q6, q2
+; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vadd.f32 q2, q3, q4
 ; CHECK-NEXT:    vadd.f32 q0, q0, q2
-; CHECK-NEXT:    vadd.f32 q1, q3, q5
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
-; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 q0, q0, q5
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
+; CHECK-NEXT:    vadd.f32 q1, q3, q1
 ; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
-; CHECK-NEXT:    vadd.f32 q1, q2, q1
-; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
-; CHECK-NEXT:    vadd.f32 q2, q3, q4
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
 ; CHECK-NEXT:    vadd.f32 q1, q5, q6
+; CHECK-NEXT:    vadd.f32 q2, q3, q4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vadd.f32 q1, q2, q1
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #112
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}