Index: llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -209,6 +209,11 @@
                       unsigned DstSubReg,
                       const TargetRegisterClass *NewRC,
                       LiveIntervals &LIS) const override;
+
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const;
 };

 } // end namespace llvm
Index: llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -909,3 +909,17 @@
   }
   return false;
 }
+
+bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                               unsigned DefSubReg,
+                                               const TargetRegisterClass *SrcRC,
+                                               unsigned SrcSubReg) const {
+  // We can't extract an SPR from an arbitrary DPR (as opposed to a DPR_VFP2).
+  if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 &&
+      SrcRC == &ARM::DPRRegClass &&
+      (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1))
+    return false;
+
+  return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg,
+                                                  SrcRC, SrcSubReg);
+}
\ No newline at end of file
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14169,6 +14169,69 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
 }

+// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
+// directly or bitcast to an integer if the original is a float vector.
+// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
+// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
+static SDValue
+PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32)
+    return SDValue();
+
+  SDValue Ext = SDValue(N, 0);
+  if (Ext.getOpcode() == ISD::BITCAST &&
+      Ext.getOperand(0).getValueType() == MVT::f32)
+    Ext = Ext.getOperand(0);
+  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(Ext.getOperand(1)) ||
+      Ext.getConstantOperandVal(1) % 2 != 0)
+    return SDValue();
+  if (Ext->use_size() == 1 &&
+      (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
+       Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
+    return SDValue();
+
+  SDValue Op0 = Ext.getOperand(0);
+  EVT VecVT = Op0.getValueType();
+  unsigned Lane = Ext.getConstantOperandVal(1);
+  if (VecVT.getVectorNumElements() != 4)
+    return SDValue();
+
+  // Find another extract, of Lane + 1
+  auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
+    return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+           isa<ConstantSDNode>(V->getOperand(1)) &&
+           V->getConstantOperandVal(1) == Lane + 1;
+  });
+  if (OtherIt == Op0->uses().end())
+    return SDValue();
+
+  // For float extracts, we need to be converting to an i32 for both vector
+  // lanes.
+  SDValue OtherExt(*OtherIt, 0);
+  if (OtherExt.getValueType() != MVT::i32) {
+    if (OtherExt->use_size() != 1 ||
+        OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
+        OtherExt->use_begin()->getValueType(0) != MVT::i32)
+      return SDValue();
+    OtherExt = SDValue(*OtherExt->use_begin(), 0);
+  }
+
+  // Convert the type to an f64 and extract with a VMOVRRD.
+ SDValue F64 = DCI.DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), + DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); + SDValue VMOVRRD = + DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); + + DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); + return VMOVRRD; +} + static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -14214,6 +14277,10 @@ DCI.DAG.getConstant(SubIdx, dl, MVT::i32)); } + // extract x, n; extract x, n+1 -> VMOVRRD x + if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) + return R; + return SDValue(); } @@ -16691,8 +16758,10 @@ return Res; } -static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { +static SDValue PerformBITCASTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + SelectionDAG &DAG = DCI.DAG; SDValue Src = N->getOperand(0); EVT DstVT = N->getValueType(0); @@ -16718,6 +16787,10 @@ DAG.getDataLayout().isBigEndian()) return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); + // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x + if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) + return R; + return SDValue(); } @@ -17023,7 +17096,7 @@ case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ISD::BITCAST: - return PerformBITCASTCombine(N, DCI.DAG, Subtarget); + return PerformBITCASTCombine(N, DCI, Subtarget); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); case ARMISD::VECTOR_REG_CAST: Index: llvm/test/CodeGen/ARM/addsubo-legalization.ll =================================================================== --- llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -12,26 +12,22 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vld1.64 {d18, d19}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov r3, r2, d18 ; CHECK-NEXT: vadd.i64 q8, q9, q8 -; CHECK-NEXT: vmov.32 r3, d18[0] -; CHECK-NEXT: vmov.32 r2, d18[1] -; CHECK-NEXT: vmov.32 r12, d16[0] -; CHECK-NEXT: vmov.32 lr, d16[1] -; CHECK-NEXT: vmov.32 r4, d17[0] -; CHECK-NEXT: vmov.32 r5, d19[0] -; CHECK-NEXT: vmov.32 r6, d17[1] -; CHECK-NEXT: vmov.32 r7, d19[1] -; CHECK-NEXT: subs.w r3, r12, r3 -; CHECK-NEXT: sbcs.w r2, lr, r2 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov r6, r7, d19 +; CHECK-NEXT: vmov lr, r12, d16 +; CHECK-NEXT: vmov r4, r5, d17 +; CHECK-NEXT: subs.w r3, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r2 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: subs r3, r4, r5 -; CHECK-NEXT: sbcs.w r3, r6, r7 +; CHECK-NEXT: subs r3, r4, r6 +; CHECK-NEXT: sbcs.w r3, r5, r7 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r1, #1 ; CHECK-NEXT: cmp r1, #0 @@ -57,24 +53,20 @@ ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0] ; CHECK-NEXT: vsub.i64 q8, q9, q8 -; CHECK-NEXT: vmov.32 r12, d18[0] -; CHECK-NEXT: vmov.32 lr, d18[1] -; CHECK-NEXT: vmov.32 r3, d16[0] -; CHECK-NEXT: vmov.32 r2, d16[1] -; CHECK-NEXT: vmov.32 r4, d19[0] -; CHECK-NEXT: vmov.32 r5, d17[0] -; CHECK-NEXT: vmov.32 r6, d19[1] -; CHECK-NEXT: vmov.32 r7, d17[1] -; CHECK-NEXT: subs.w r3, r12, r3 -; CHECK-NEXT: sbcs.w r2, lr, r2 +; CHECK-NEXT: vmov lr, r12, d18 +; CHECK-NEXT: vmov r4, r5, d19 +; CHECK-NEXT: vmov r3, r2, d16 
+; CHECK-NEXT: vmov r6, r7, d17 +; CHECK-NEXT: subs.w r3, lr, r3 +; CHECK-NEXT: sbcs.w r2, r12, r2 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: subs r3, r4, r5 -; CHECK-NEXT: sbcs.w r3, r6, r7 +; CHECK-NEXT: subs r3, r4, r6 +; CHECK-NEXT: sbcs.w r3, r5, r7 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r1, #1 ; CHECK-NEXT: cmp r1, #0 Index: llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll =================================================================== --- llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll +++ llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll @@ -492,24 +492,23 @@ define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) { ; CHECK-LABEL: conv_v8f16_to_i128: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: adr r1, .LCPI18_0 ; CHECK-NEXT: vrev64.16 q9, q0 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128] ; CHECK-NEXT: vrev64.16 q8, q8 ; CHECK-NEXT: vadd.f16 q8, q9, q8 ; CHECK-NEXT: vrev32.16 q8, q8 -; CHECK-NEXT: vmov.32 r12, d17[1] -; CHECK-NEXT: vmov.32 r2, d17[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: vmov.32 r1, d16[0] -; CHECK-NEXT: subs r12, r12, #1 -; CHECK-NEXT: str r12, [r0, #12] -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: str r2, [r0, #8] -; CHECK-NEXT: sbcs r3, r3, #0 -; CHECK-NEXT: sbc r1, r1, #0 -; CHECK-NEXT: stm r0, {r1, r3} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r12, r2, d17 +; CHECK-NEXT: vmov r3, r1, d16 +; CHECK-NEXT: subs lr, r2, #1 +; CHECK-NEXT: sbcs r2, r12, #0 +; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: sbc r3, r3, #0 +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: stmib r0, {r1, r2, lr} +; CHECK-NEXT: pop {r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI18_0: Index: llvm/test/CodeGen/ARM/big-endian-vector-callee.ll =================================================================== --- llvm/test/CodeGen/ARM/big-endian-vector-callee.ll +++ llvm/test/CodeGen/ARM/big-endian-vector-callee.ll @@ -1050,21 +1050,13 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d16, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d17, r1, r0 ; SOFT-NEXT: vadd.f64 d19, d16, d16 ; SOFT-NEXT: vadd.f64 d18, d17, d17 ; SOFT-NEXT: vrev64.32 q8, q9 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1076,19 +1068,11 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vadd.f64 d17, d1, d1 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vadd.f64 d16, d0, d0 ; HARD-NEXT: vrev64.32 q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, #8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: 
bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} @@ -1106,20 +1090,12 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d17, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d16, r1, r0 ; SOFT-NEXT: vadd.i64 q8, q8, q8 ; SOFT-NEXT: vrev64.32 q8, q8 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1131,18 +1107,10 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vadd.i64 q8, q0, q0 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vrev64.32 q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, #8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} @@ -1160,20 +1128,12 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d17, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d16, r1, r0 ; SOFT-NEXT: vrev64.32 q8, q8 ; SOFT-NEXT: vadd.f32 q8, q8, q8 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1185,18 +1145,10 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vrev64.32 q8, q0 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vadd.f32 q8, q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, #8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} @@ -1214,20 +1166,12 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d17, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d16, r1, r0 ; SOFT-NEXT: vrev64.32 q8, q8 ; SOFT-NEXT: vadd.i32 q8, q8, q8 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 
r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1239,18 +1183,10 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vrev64.32 q8, q0 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vadd.i32 q8, q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, #8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} @@ -1268,21 +1204,13 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d17, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d16, r1, r0 ; SOFT-NEXT: vrev64.16 q8, q8 ; SOFT-NEXT: vadd.i16 q8, q8, q8 ; SOFT-NEXT: vrev32.16 q8, q8 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1294,19 +1222,11 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vrev64.16 q8, q0 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vadd.i16 q8, q8, q8 ; HARD-NEXT: vrev32.16 q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, #8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} @@ -1324,21 +1244,13 @@ ; SOFT-NEXT: .pad #16 ; SOFT-NEXT: sub sp, sp, #16 ; SOFT-NEXT: vmov d17, r3, r2 -; SOFT-NEXT: add r12, sp, #12 ; SOFT-NEXT: vmov d16, r1, r0 ; SOFT-NEXT: vrev64.8 q8, q8 ; SOFT-NEXT: vadd.i8 q8, q8, q8 ; SOFT-NEXT: vrev32.8 q8, q8 -; SOFT-NEXT: vmov.32 r0, d16[0] -; SOFT-NEXT: vst1.32 {d17[1]}, [r12:32] -; SOFT-NEXT: add r12, sp, #8 -; SOFT-NEXT: vst1.32 {d16[0]}, [sp:32] -; SOFT-NEXT: vst1.32 {d17[0]}, [r12:32] -; SOFT-NEXT: add r12, sp, #4 -; SOFT-NEXT: vst1.32 {d16[1]}, [r12:32] -; SOFT-NEXT: vmov.32 r1, d16[1] -; SOFT-NEXT: vmov.32 r2, d17[0] -; SOFT-NEXT: vmov.32 r3, d17[1] +; SOFT-NEXT: vmov r2, r3, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: stm sp, {r0, r1, r2, r3} ; SOFT-NEXT: bl __addtf3 ; SOFT-NEXT: add sp, sp, #16 ; SOFT-NEXT: pop {r11, pc} @@ -1350,19 +1262,11 @@ ; HARD-NEXT: .pad #16 ; HARD-NEXT: sub sp, sp, #16 ; HARD-NEXT: vrev64.8 q8, q0 -; HARD-NEXT: add r12, sp, #12 ; HARD-NEXT: vadd.i8 q8, q8, q8 ; HARD-NEXT: vrev32.8 q8, q8 -; HARD-NEXT: vmov.32 r0, d16[0] -; HARD-NEXT: vst1.32 {d17[1]}, [r12:32] -; HARD-NEXT: add r12, sp, 
#8 -; HARD-NEXT: vst1.32 {d16[0]}, [sp:32] -; HARD-NEXT: vst1.32 {d17[0]}, [r12:32] -; HARD-NEXT: add r12, sp, #4 -; HARD-NEXT: vst1.32 {d16[1]}, [r12:32] -; HARD-NEXT: vmov.32 r1, d16[1] -; HARD-NEXT: vmov.32 r2, d17[0] -; HARD-NEXT: vmov.32 r3, d17[1] +; HARD-NEXT: vmov r2, r3, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: stm sp, {r0, r1, r2, r3} ; HARD-NEXT: bl __addtf3 ; HARD-NEXT: add sp, sp, #16 ; HARD-NEXT: pop {r11, pc} Index: llvm/test/CodeGen/ARM/combine-vmovdrr.ll =================================================================== --- llvm/test/CodeGen/ARM/combine-vmovdrr.ll +++ llvm/test/CodeGen/ARM/combine-vmovdrr.ll @@ -75,10 +75,8 @@ ; CHECK-LABEL: severalUses: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vmov.32 r2, d16[1] -; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: vmov r0, r2, d16 ; CHECK-NEXT: vldr d18, [r1] -; CHECK-NEXT: vmov d16, r0, r2 ; CHECK-NEXT: vtbl.8 d16, {d16, d17}, d18 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov r1, r2 Index: llvm/test/CodeGen/ARM/vselect_imax.ll =================================================================== --- llvm/test/CodeGen/ARM/vselect_imax.ll +++ llvm/test/CodeGen/ARM/vselect_imax.ll @@ -72,55 +72,47 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]! ; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]! +; CHECK-NEXT: vmov r4, r6, d16 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128] -; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vmov.32 r12, d18[0] -; CHECK-NEXT: vmov.32 r2, d20[0] -; CHECK-NEXT: vmov.32 lr, d18[1] -; CHECK-NEXT: vmov.32 r0, d20[1] -; CHECK-NEXT: vmov.32 r7, d16[0] -; CHECK-NEXT: vmov.32 r5, d22[0] -; CHECK-NEXT: vmov.32 r4, d22[1] -; CHECK-NEXT: vmov.32 r6, d19[0] -; CHECK-NEXT: subs r2, r2, r12 -; CHECK-NEXT: vmov.32 r2, d16[1] -; CHECK-NEXT: sbcs r0, r0, lr +; CHECK-NEXT: vmov lr, r12, d18 ; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmov r2, r1, d20 +; CHECK-NEXT: subs r2, r2, lr +; CHECK-NEXT: vmov r7, lr, d17 +; CHECK-NEXT: vmov r2, r5, d22 +; CHECK-NEXT: sbcs r1, r1, r12 +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: subs r2, r2, r4 +; CHECK-NEXT: sbcs r6, r5, r6 +; CHECK-NEXT: vmov r2, r12, d19 +; CHECK-NEXT: vmov r5, r4, d21 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs r4, r4, r12 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vmov r4, r5, d23 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: subs r7, r4, r7 +; CHECK-NEXT: sbcs r7, r5, lr ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: vmov.32 r7, d21[0] -; CHECK-NEXT: vmov.32 r5, d19[1] -; CHECK-NEXT: sbcs r2, r4, r2 -; CHECK-NEXT: vmov.32 r4, d21[1] -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vdup.32 d25, r0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r7, r7, r6 -; CHECK-NEXT: vmov.32 r6, d23[0] -; CHECK-NEXT: vmov.32 r7, d17[0] -; CHECK-NEXT: sbcs r5, r4, r5 -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: vmov.32 r5, d17[1] -; CHECK-NEXT: subs r7, r6, r7 -; CHECK-NEXT: vmov.32 r7, d23[1] -; CHECK-NEXT: sbcs r7, r7, r5 -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: cmp r4, #0 -; 
CHECK-NEXT: vdup.32 d25, r1 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d24, r2 -; CHECK-NEXT: vdup.32 d27, r4 +; CHECK-NEXT: vdup.32 d24, r6 +; CHECK-NEXT: vdup.32 d27, r2 ; CHECK-NEXT: vbit q8, q11, q12 -; CHECK-NEXT: vdup.32 d26, r0 +; CHECK-NEXT: vdup.32 d26, r1 ; CHECK-NEXT: vbit q9, q10, q13 ; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]! ; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] @@ -142,111 +134,98 @@ %T1_19* %blend, %T0_19* %storeaddr) { ; CHECK-LABEL: func_blend19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: add r2, r1, #48 -; CHECK-NEXT: add r5, r1, #32 +; CHECK-NEXT: mov r8, #0 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128] ; CHECK-NEXT: add r2, r0, #48 -; CHECK-NEXT: add r6, r0, #32 -; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: mov lr, #0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] -; CHECK-NEXT: vmov.32 r12, d16[0] -; CHECK-NEXT: vmov.32 r2, d18[0] -; CHECK-NEXT: vmov.32 lr, d16[1] -; CHECK-NEXT: vmov.32 r4, d18[1] -; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]! -; CHECK-NEXT: vld1.64 {d26, d27}, [r5:128] -; CHECK-NEXT: vld1.64 {d30, d31}, [r6:128] -; CHECK-NEXT: vmov.32 r5, d17[0] -; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128] -; CHECK-NEXT: vmov.32 r0, d17[1] -; CHECK-NEXT: vld1.64 {d24, d25}, [r1:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: subs r2, r2, r12 +; CHECK-NEXT: vmov r2, r12, d16 +; CHECK-NEXT: vmov r6, r7, d17 +; CHECK-NEXT: vmov r4, r5, d18 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sbcs r2, r5, r12 ; CHECK-NEXT: mov r12, #0 -; CHECK-NEXT: vmov.32 r2, d19[0] -; CHECK-NEXT: sbcs r6, r4, lr -; CHECK-NEXT: vmov.32 r4, d24[0] -; CHECK-NEXT: vmov.32 r6, d19[1] +; CHECK-NEXT: vmov r2, r4, d19 ; CHECK-NEXT: movlt r12, #1 ; CHECK-NEXT: cmp r12, #0 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: mvnne r12, #0 -; CHECK-NEXT: subs r2, r2, r5 -; CHECK-NEXT: vmov.32 r5, d28[0] +; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128]! +; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128] +; CHECK-NEXT: subs r2, r2, r6 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vld1.64 {d26, d27}, [r2:128]! 
+; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128] +; CHECK-NEXT: sbcs r2, r4, r7 +; CHECK-NEXT: vmov r4, r5, d21 +; CHECK-NEXT: movlt r8, #1 +; CHECK-NEXT: vmov r6, r7, d23 +; CHECK-NEXT: cmp r8, #0 +; CHECK-NEXT: mvnne r8, #0 +; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128] +; CHECK-NEXT: add r0, r1, #32 +; CHECK-NEXT: vld1.64 {d30, d31}, [r0:128] +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vdup.32 d7, r8 +; CHECK-NEXT: vdup.32 d6, r12 +; CHECK-NEXT: subs r4, r6, r4 +; CHECK-NEXT: sbcs r4, r7, r5 +; CHECK-NEXT: vmov r5, r6, d24 +; CHECK-NEXT: vmov r7, r2, d26 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d5, r4 +; CHECK-NEXT: subs r5, r7, r5 +; CHECK-NEXT: sbcs r2, r2, r6 +; CHECK-NEXT: vmov r7, r6, d27 +; CHECK-NEXT: vmov r2, r9, d25 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: mvnne r5, #0 +; CHECK-NEXT: subs r2, r7, r2 +; CHECK-NEXT: sbcs r2, r6, r9 +; CHECK-NEXT: vmov r6, r7, d22 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: sbcs r0, r6, r0 -; CHECK-NEXT: vmov.32 r6, d28[1] -; CHECK-NEXT: vmov.32 r0, d24[1] ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d7, r2 -; CHECK-NEXT: vdup.32 d6, r12 -; CHECK-NEXT: subs r5, r5, r4 -; CHECK-NEXT: vmov.32 r4, d25[1] -; CHECK-NEXT: vmov.32 r5, d25[0] -; CHECK-NEXT: sbcs r0, r6, r0 +; CHECK-NEXT: vdup.32 d1, r2 +; CHECK-NEXT: vdup.32 d0, r5 +; CHECK-NEXT: vbit q12, q13, q0 +; CHECK-NEXT: subs r0, r6, r0 +; CHECK-NEXT: vmov r2, r6, d28 +; CHECK-NEXT: sbcs r0, r7, r1 +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: vmov r0, r1, d30 +; CHECK-NEXT: movlt r7, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: vmov r2, r5, d29 +; CHECK-NEXT: sbcs r0, r6, r1 ; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: vmov.32 r0, d29[0] +; CHECK-NEXT: vmov r0, r1, d31 ; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r5, r1 +; CHECK-NEXT: movlt lr, #1 +; CHECK-NEXT: cmp lr, #0 +; CHECK-NEXT: mvnne lr, #0 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: subs r0, r0, r5 -; CHECK-NEXT: vmov.32 r5, d21[0] -; CHECK-NEXT: vmov.32 r0, d29[1] -; CHECK-NEXT: sbcs r0, r0, r4 -; CHECK-NEXT: vmov.32 r4, d23[0] -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d1, r0 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: vdup.32 d0, r6 -; CHECK-NEXT: vmov.32 r6, d22[0] -; CHECK-NEXT: vbit q12, q14, q0 -; CHECK-NEXT: subs r5, r4, r5 -; CHECK-NEXT: vmov.32 r4, d23[1] -; CHECK-NEXT: vmov.32 r5, d21[1] -; CHECK-NEXT: sbcs r5, r4, r5 -; CHECK-NEXT: vmov.32 r4, d20[1] -; CHECK-NEXT: vmov.32 r5, d20[0] -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d5, r0 -; CHECK-NEXT: add r0, r3, #32 -; CHECK-NEXT: subs r6, r6, r5 -; CHECK-NEXT: vmov.32 r5, d26[0] -; CHECK-NEXT: vmov.32 r6, d22[1] -; CHECK-NEXT: sbcs r6, r6, r4 -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: vmov.32 r6, d30[0] -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: subs r6, r6, r5 -; CHECK-NEXT: vmov.32 r5, d30[1] -; CHECK-NEXT: vmov.32 r6, d26[1] -; CHECK-NEXT: sbcs r6, r5, r6 -; CHECK-NEXT: vmov.32 r5, d31[0] -; CHECK-NEXT: vmov.32 r6, d27[0] -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs r6, r5, r6 -; CHECK-NEXT: vmov.32 r5, d31[1] -; CHECK-NEXT: vmov.32 r6, d27[1] -; CHECK-NEXT: sbcs r6, r5, r6 -; CHECK-NEXT: movlt r7, #1 +; CHECK-NEXT: vdup.32 d3, lr +; CHECK-NEXT: vdup.32 d2, r6 ; CHECK-NEXT: cmp r7, #0 
+; CHECK-NEXT: vorr q13, q1, q1 ; CHECK-NEXT: mvnne r7, #0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d3, r7 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vdup.32 d2, r1 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vbit q13, q15, q1 -; CHECK-NEXT: vdup.32 d4, r4 +; CHECK-NEXT: vdup.32 d4, r7 +; CHECK-NEXT: add r0, r3, #32 +; CHECK-NEXT: vbsl q13, q14, q15 ; CHECK-NEXT: vbit q10, q11, q2 ; CHECK-NEXT: vbit q8, q9, q3 ; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128] @@ -254,7 +233,7 @@ ; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]! ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] ; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128] -; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: mov pc, lr %v0 = load %T0_19, %T0_19* %loadaddr %v1 = load %T0_19, %T0_19* %loadaddr2 @@ -280,202 +259,170 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, sp, #8 -; CHECK-NEXT: mov r8, r1 -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]! -; CHECK-NEXT: add r10, r0, #64 -; CHECK-NEXT: vld1.64 {d18, d19}, [r9:128]! -; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: add r9, r1, #64 +; CHECK-NEXT: mov r2, #32 +; CHECK-NEXT: add r8, r0, #64 +; CHECK-NEXT: vld1.64 {d16, d17}, [r9:128], r2 +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128], r2 +; CHECK-NEXT: vmov r7, r5, d17 +; CHECK-NEXT: vmov r6, r2, d19 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vmov.32 r6, d18[0] -; CHECK-NEXT: vmov.32 r4, d16[1] -; CHECK-NEXT: vmov.32 r7, d18[1] -; CHECK-NEXT: vmov.32 r5, d17[0] -; CHECK-NEXT: subs r2, r6, r2 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: vmov.32 r2, d19[0] -; CHECK-NEXT: sbcs r7, r7, r4 -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: vmov.32 r7, d17[1] -; CHECK-NEXT: subs r2, r2, r5 -; CHECK-NEXT: vmov.32 r2, d19[1] -; CHECK-NEXT: sbcs r2, r2, r7 +; CHECK-NEXT: vld1.64 {d22, d23}, [r10:128]! +; CHECK-NEXT: subs r7, r6, r7 +; CHECK-NEXT: sbcs r2, r2, r5 +; CHECK-NEXT: vmov r5, r6, d16 +; CHECK-NEXT: vmov r7, r4, d18 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: vdup.32 d21, r2 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: vdup.32 d20, r6 -; CHECK-NEXT: mov r2, #32 -; CHECK-NEXT: add r6, r1, #64 -; CHECK-NEXT: vld1.64 {d24, d25}, [r10:128], r2 -; CHECK-NEXT: vbit q8, q9, q10 -; CHECK-NEXT: vld1.64 {d28, d29}, [r6:128], r2 -; CHECK-NEXT: vmov.32 r4, d29[0] -; CHECK-NEXT: vmov.32 r5, d25[0] -; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128] -; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128] -; CHECK-NEXT: vld1.64 {d22, d23}, [r6:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [r6:128] -; CHECK-NEXT: vmov.32 r6, d0[0] -; CHECK-NEXT: vld1.64 {d18, d19}, [r10:128]! 
-; CHECK-NEXT: vmov.32 r9, d23[0] -; CHECK-NEXT: vmov.32 r11, d19[0] -; CHECK-NEXT: vmov.32 r8, d23[1] -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d25[1] -; CHECK-NEXT: vmov.32 r4, d29[1] -; CHECK-NEXT: sbcs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d24[0] -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d5, r4 -; CHECK-NEXT: vmov.32 r4, d28[0] -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d24[1] -; CHECK-NEXT: vmov.32 r4, d28[1] -; CHECK-NEXT: sbcs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d1[0] -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d4, r4 -; CHECK-NEXT: vmov.32 r4, d3[0] -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d1[1] -; CHECK-NEXT: vmov.32 r4, d3[1] -; CHECK-NEXT: sbcs r4, r5, r4 -; CHECK-NEXT: add r5, r1, #32 -; CHECK-NEXT: vld1.64 {d26, d27}, [r5:128] -; CHECK-NEXT: add r5, r1, #48 +; CHECK-NEXT: subs r5, r7, r5 +; CHECK-NEXT: sbcs r4, r4, r6 ; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: add r1, r1, #80 -; CHECK-NEXT: vld1.64 {d30, d31}, [r5:128] ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vbif q12, q14, q2 -; CHECK-NEXT: vmov.32 r5, d2[0] ; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d29, r4 -; CHECK-NEXT: vmov.32 r4, d31[1] -; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d0[1] -; CHECK-NEXT: vmov.32 r5, d2[1] -; CHECK-NEXT: sbcs r5, r6, r5 -; CHECK-NEXT: add r6, r0, #48 -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: vld1.64 {d6, d7}, [r6:128] -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: vmov.32 r7, d7[0] -; CHECK-NEXT: vdup.32 d28, r5 -; CHECK-NEXT: vmov.32 r5, d31[0] -; CHECK-NEXT: vbsl q14, q0, q1 -; CHECK-NEXT: vmov.32 r6, d7[1] -; CHECK-NEXT: vmov.32 r2, d6[0] -; CHECK-NEXT: subs r5, r7, r5 -; CHECK-NEXT: vmov.32 r7, d6[1] -; CHECK-NEXT: sbcs r4, r6, r4 -; CHECK-NEXT: vmov.32 r6, d30[0] -; CHECK-NEXT: vmov.32 r5, d30[1] +; CHECK-NEXT: vdup.32 d20, r4 +; CHECK-NEXT: vmov r2, r4, d23 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vld1.64 {d18, d19}, [r11:128]! +; CHECK-NEXT: vmov r7, r5, d19 +; CHECK-NEXT: subs r2, r7, r2 +; CHECK-NEXT: sbcs r2, r5, r4 +; CHECK-NEXT: vmov r5, r7, d18 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d21, r2 +; CHECK-NEXT: vmov r2, r4, d22 +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs r2, r7, r4 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d20, r2 +; CHECK-NEXT: add r2, r0, #48 +; CHECK-NEXT: vbif q9, q11, q10 +; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128] +; CHECK-NEXT: add r2, r1, #48 +; CHECK-NEXT: vld1.64 {d2, d3}, [r2:128] +; CHECK-NEXT: vmov r5, r7, d30 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: vld1.64 {d26, d27}, [r11:128] +; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128] +; CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]! +; CHECK-NEXT: vld1.64 {d22, d23}, [r9:128] +; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]! 
+; CHECK-NEXT: vmov r11, r10, d21 +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs r2, r7, r4 +; CHECK-NEXT: vmov r7, r6, d31 +; CHECK-NEXT: vmov r2, r5, d3 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d3, r4 -; CHECK-NEXT: vmov.32 r4, d26[1] -; CHECK-NEXT: subs r2, r2, r6 -; CHECK-NEXT: sbcs r2, r7, r5 -; CHECK-NEXT: add r5, r0, #32 +; CHECK-NEXT: subs r2, r7, r2 +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: sbcs r2, r6, r5 +; CHECK-NEXT: vmov r6, r5, d27 +; CHECK-NEXT: vmov r2, r9, d1 +; CHECK-NEXT: movlt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: vdup.32 d7, r7 +; CHECK-NEXT: vdup.32 d6, r4 +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: sbcs r2, r5, r9 +; CHECK-NEXT: vmov r6, r5, d26 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d5, r2 +; CHECK-NEXT: vmov r2, r9, d0 +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: sbcs r2, r5, r9 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vld1.64 {d0, d1}, [r5:128] ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vmov.32 r6, d0[0] -; CHECK-NEXT: vdup.32 d2, r2 +; CHECK-NEXT: vdup.32 d4, r2 +; CHECK-NEXT: add r2, r1, #32 +; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128] +; CHECK-NEXT: add r2, r0, #32 +; CHECK-NEXT: vbif q13, q0, q2 +; CHECK-NEXT: add r1, r1, #80 +; CHECK-NEXT: vld1.64 {d0, d1}, [r2:128] +; CHECK-NEXT: vmov r4, r5, d28 +; CHECK-NEXT: vbif q15, q1, q3 ; CHECK-NEXT: add r0, r0, #80 -; CHECK-NEXT: vmov.32 r2, d26[0] -; CHECK-NEXT: vbit q15, q3, q1 -; CHECK-NEXT: vmov.32 r5, d0[1] -; CHECK-NEXT: vmov.32 r7, d1[0] -; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128] -; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128] +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128] +; CHECK-NEXT: vmov r9, r8, d25 ; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128] -; CHECK-NEXT: vmov.32 r1, d7[1] -; CHECK-NEXT: vmov.32 r10, d19[1] -; CHECK-NEXT: vmov.32 lr, d6[0] -; CHECK-NEXT: vmov.32 r3, d8[0] -; CHECK-NEXT: vmov.32 r12, d8[1] -; CHECK-NEXT: subs r2, r6, r2 -; CHECK-NEXT: vmov.32 r6, d1[1] -; CHECK-NEXT: sbcs r2, r5, r4 -; CHECK-NEXT: vmov.32 r5, d27[0] -; CHECK-NEXT: vmov.32 r4, d27[1] +; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128] +; CHECK-NEXT: vmov r3, r12, d8 +; CHECK-NEXT: subs r2, r2, r4 +; CHECK-NEXT: sbcs r2, r6, r5 +; CHECK-NEXT: vmov r4, r5, d29 +; CHECK-NEXT: vmov r6, r7, d1 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: subs r5, r7, r5 -; CHECK-NEXT: vmov.32 r7, d7[0] -; CHECK-NEXT: sbcs r4, r6, r4 -; CHECK-NEXT: vmov.32 r6, d2[0] +; CHECK-NEXT: subs r4, r6, r4 +; CHECK-NEXT: sbcs r4, r7, r5 +; CHECK-NEXT: vmov r5, r6, d2 ; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: vmov.32 r5, d2[1] ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: mvnne r4, #0 ; CHECK-NEXT: vdup.32 d5, r4 ; CHECK-NEXT: vdup.32 d4, r2 -; CHECK-NEXT: vmov.32 r2, d20[0] -; CHECK-NEXT: vbit q13, q0, q2 -; CHECK-NEXT: vmov.32 r4, d20[1] -; CHECK-NEXT: subs r0, r6, r2 -; CHECK-NEXT: vmov.32 r2, d9[1] -; CHECK-NEXT: sbcs r0, r5, r4 -; CHECK-NEXT: vmov.32 r4, d9[0] -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: vmov.32 r6, d18[0] -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vmov.32 r5, d18[1] -; CHECK-NEXT: subs r4, r4, r7 -; CHECK-NEXT: vmov.32 r7, d21[1] -; CHECK-NEXT: sbcs r1, r2, r1 -; CHECK-NEXT: vmov.32 r4, d22[1] -; CHECK-NEXT: vmov.32 
r1, d22[0] +; CHECK-NEXT: vmov r2, r4, d22 +; CHECK-NEXT: vbit q14, q0, q2 +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs r2, r6, r4 +; CHECK-NEXT: vmov r4, r5, d24 +; CHECK-NEXT: vmov r6, r7, d20 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d11, r2 -; CHECK-NEXT: vmov.32 r2, d3[1] -; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: vmov.32 r6, d21[0] -; CHECK-NEXT: sbcs r1, r5, r4 -; CHECK-NEXT: vmov.32 r4, d3[0] -; CHECK-NEXT: vmov.32 r5, d6[1] +; CHECK-NEXT: subs r1, r6, r4 +; CHECK-NEXT: vmov r0, r6, d9 +; CHECK-NEXT: sbcs r1, r7, r5 +; CHECK-NEXT: vmov r4, r5, d7 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: sbcs r2, r2, r7 -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: vmov r7, r4, d23 +; CHECK-NEXT: sbcs r0, r6, r5 +; CHECK-NEXT: vmov r5, lr, d6 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: vdup.32 d11, r0 +; CHECK-NEXT: vmov r0, r6, d3 +; CHECK-NEXT: subs r0, r0, r7 +; CHECK-NEXT: sbcs r0, r6, r4 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: subs r4, r11, r9 ; CHECK-NEXT: sbcs r4, r10, r8 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: subs r3, r3, lr -; CHECK-NEXT: sbcs r3, r12, r5 +; CHECK-NEXT: subs r3, r3, r5 +; CHECK-NEXT: sbcs r3, r12, lr ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 @@ -485,28 +432,28 @@ ; CHECK-NEXT: vdup.32 d10, r3 ; CHECK-NEXT: vdup.32 d1, r4 ; CHECK-NEXT: vorr q2, q5, q5 -; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: vdup.32 d0, r1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vbsl q2, q4, q3 -; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vbif q9, q11, q0 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: vbif q10, q12, q0 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vdup.32 d7, r2 -; CHECK-NEXT: vdup.32 d6, r0 +; CHECK-NEXT: vdup.32 d7, r0 ; CHECK-NEXT: add r0, r1, #80 -; CHECK-NEXT: vbit q10, q1, q3 +; CHECK-NEXT: vdup.32 d6, r2 +; CHECK-NEXT: vbit q11, q1, q3 ; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128] ; CHECK-NEXT: add r0, r1, #32 -; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128] +; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128] ; CHECK-NEXT: add r0, r1, #48 ; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128] ; CHECK-NEXT: add r0, r1, #64 -; CHECK-NEXT: vst1.64 {d16, d17}, [r1:128]! -; CHECK-NEXT: vst1.64 {d28, d29}, [r1:128] +; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]! +; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128] ; CHECK-NEXT: mov r1, #32 -; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128], r1 -; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]! -; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1 +; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]! 
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128] ; CHECK-NEXT: add sp, sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, sp, #4 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1298,19 +1298,21 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: half_short_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB8_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB8_6 ; CHECK-NEXT: .LBB8_3: @ %vector.ph -; CHECK-NEXT: bic r12, r3, #3 +; CHECK-NEXT: bic r7, r3, #3 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: subs r6, r7, #4 ; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: sub.w r6, r12, #4 ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2 ; CHECK-NEXT: mov r5, r1 @@ -1319,19 +1321,17 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u32 q0, [r5], #8 ; CHECK-NEXT: ldr.w r9, [r4] -; CHECK-NEXT: ldr.w r8, [r4, #4] +; CHECK-NEXT: ldr.w r10, [r4, #4] ; CHECK-NEXT: adds r4, #8 -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.16 q1[0], r7 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.16 q1[1], r7 -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.16 q1[2], r7 -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vmov.16 q1[3], r7 -; CHECK-NEXT: vcvt.f16.s16 q0, q1 +; CHECK-NEXT: vmov r7, r12, d0 ; CHECK-NEXT: vmov.32 q1[0], r9 -; CHECK-NEXT: vmov.32 q1[1], r8 +; CHECK-NEXT: vmov r11, r8, d1 +; CHECK-NEXT: vmov.16 q0[0], r7 +; CHECK-NEXT: vmov.16 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[1], r10 +; CHECK-NEXT: vmov.16 q0[2], r11 +; CHECK-NEXT: vmov.16 q0[3], r8 +; CHECK-NEXT: vcvt.f16.s16 q0, q0 ; CHECK-NEXT: vmul.f16 q0, q1, q0 ; CHECK-NEXT: vcvtt.f32.f16 s7, s1 ; CHECK-NEXT: vcvtb.f32.f16 s6, s1 @@ -1340,13 +1340,14 @@ ; CHECK-NEXT: vstrb.8 q1, [r6], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r12, r3 +; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB8_8 ; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13 -; CHECK-NEXT: sub.w lr, r3, r12 -; CHECK-NEXT: add.w r0, r0, r12, lsl #1 -; CHECK-NEXT: add.w r1, r1, r12, lsl #1 -; CHECK-NEXT: add.w r2, r2, r12, lsl #2 +; CHECK-NEXT: sub.w lr, r3, r7 +; CHECK-NEXT: add.w r0, r0, r7, lsl #1 +; CHECK-NEXT: add.w r1, r1, r7, lsl #1 +; CHECK-NEXT: add.w r2, r2, r7, lsl #2 ; CHECK-NEXT: .LBB8_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r3, [r1], #2 @@ -1359,7 +1360,8 @@ ; CHECK-NEXT: vstmia r2!, {s0} ; CHECK-NEXT: le lr, .LBB8_7 ; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp10 = icmp eq i32 %N, 0 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader Index: llvm/test/CodeGen/Thumb2/active_lane_mask.ll =================================================================== --- llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ 
llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -116,51 +116,43 @@ ; CHECK-NEXT: vadd.i32 q3, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q5, q3 ; CHECK-NEXT: vpsel q4, q2, q1 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r1, r12, d8 ; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov.16 q0[1], r12 +; CHECK-NEXT: vmov r1, r12, d9 ; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: adr r1, .LCPI2_1 ; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov.16 q0[3], r12 ; CHECK-NEXT: vadd.i32 q4, q4, r0 ; CHECK-NEXT: vcmp.u32 hi, q5, q4 ; CHECK-NEXT: vpsel q5, q2, q1 -; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov r1, r12, d10 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r1, s22 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov r1, s23 +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov r1, r12, d11 ; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vmov.16 q0[6], r1 ; CHECK-NEXT: vcmp.u32 hi, q5, q3 +; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: vpsel q6, q2, q1 ; CHECK-NEXT: vcmp.u32 hi, q5, q4 -; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov r0, r1, d12 ; CHECK-NEXT: vpsel q1, q2, q1 ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov r0, r1, d13 ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q3[7], r0 ; CHECK-NEXT: add r0, sp, #56 -; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vmov.16 q3[7], r1 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmp.i16 ne, q3, zr ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i16 ne, q0, zr @@ -201,27 +193,23 @@ ; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q1 ; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r12, d0 ; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[1], r12 +; CHECK-NEXT: vmov r1, r12, d1 ; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov.16 q2[3], r1 ; CHECK-NEXT: adr r1, .LCPI3_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov.16 q2[3], r12 ; CHECK-NEXT: vadd.i32 q3, q0, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q3 ; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r12, d0 ; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[5], r12 +; CHECK-NEXT: vmov r1, r12, d1 ; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vmov.16 q2[7], r12 ; CHECK-NEXT: vcmp.i16 ne, q2, zr ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vmov.u16 r1, q0[0] @@ -246,28 +234,24 @@ ; CHECK-NEXT: vcmp.u32 hi, q7, q0 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: vpsel q6, q4, q5 -; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: vmov r1, r12, d12 ; CHECK-NEXT: 
vmov.16 q0[0], r1 -; CHECK-NEXT: vmov r1, s25 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: vmov.16 q0[1], r12 +; CHECK-NEXT: vmov r1, r12, d13 ; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: adr r1, .LCPI3_3 ; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov.16 q0[3], r12 ; CHECK-NEXT: vadd.i32 q6, q6, r0 ; CHECK-NEXT: vcmp.u32 hi, q7, q6 ; CHECK-NEXT: vpsel q7, q4, q5 -; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: vmov r1, r12, d14 ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov r1, s29 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r1, s30 +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov r1, r12, d15 ; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov r1, s31 -; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vdup.32 q7, r0 +; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vcmp.u32 hi, q7, q1 @@ -285,27 +269,23 @@ ; CHECK-NEXT: vmov.u16 r1, q0[5] ; CHECK-NEXT: vmov.8 q2[13], r1 ; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.8 q2[14], r1 ; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vcmp.u32 hi, q7, q3 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vmov.u16 r0, q0[0] @@ -328,23 +308,19 @@ ; CHECK-NEXT: vcmp.u32 hi, q7, q0 ; CHECK-NEXT: vpsel q1, q4, q5 ; CHECK-NEXT: vcmp.u32 hi, q7, q6 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q0, q4, q5 ; CHECK-NEXT: vmov.u16 r0, q0[0] @@ -423,50 +399,45 @@ ; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vldrw.u32 q2, [r2] ; CHECK-NEXT: add.w lr, r3, r0, lsr #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vand q1, q1, q0 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r8 -; CHECK-NEXT: vmov r7, s6 +; CHECK-NEXT: vmov q3[2], q3[0], r12, r12 +; CHECK-NEXT: vmov r6, r7, d3 ; 
CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add.w r8, r8, #2 +; CHECK-NEXT: add.w r12, r12, #2 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: vmov r9, s12 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: vmov q3[2], q3[0], r9, r3 +; CHECK-NEXT: adds r0, r2, #1 +; CHECK-NEXT: vmov q3[2], q3[0], r9, r0 +; CHECK-NEXT: adc r8, r3, #0 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: adc r12, r2, #0 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: sbcs r4, r6 -; CHECK-NEXT: vmov r6, s13 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: vmov r4, r5, d7 +; CHECK-NEXT: subs r6, r4, r6 +; CHECK-NEXT: eor.w r0, r0, r4 +; CHECK-NEXT: sbcs r5, r7 +; CHECK-NEXT: vmov r6, r7, d6 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: subs r2, r7, r2 -; CHECK-NEXT: sbcs.w r0, r6, r0 -; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: movlo r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: subs r3, r6, r3 +; CHECK-NEXT: sbcs.w r2, r7, r2 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r4 -; CHECK-NEXT: eor.w r0, r5, r3 -; CHECK-NEXT: orrs.w r0, r0, r12 +; CHECK-NEXT: movlo r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: orrs.w r0, r0, r8 ; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: vmov q3[2], q3[0], r2, r5 ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r5 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: teq.w r7, r9 +; CHECK-NEXT: teq.w r6, r9 ; CHECK-NEXT: cset r2, ne ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne Index: llvm/test/CodeGen/Thumb2/mve-abs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-abs.ll +++ llvm/test/CodeGen/Thumb2/mve-abs.ll @@ -40,19 +40,17 @@ define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) { ; CHECK-LABEL: abs_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: adds.w r1, r1, r0, asr #31 -; CHECK-NEXT: adc.w r12, r0, r0, asr #31 -; CHECK-NEXT: eor.w r1, r1, r0, asr #31 -; CHECK-NEXT: adds.w r2, r2, r3, asr #31 -; CHECK-NEXT: eor.w r0, r12, r0, asr #31 -; CHECK-NEXT: eor.w r2, r2, r3, asr #31 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: adc.w r1, r3, r3, asr #31 -; CHECK-NEXT: eor.w r1, r1, r3, asr #31 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r0, r0, r1, asr #31 +; CHECK-NEXT: adc.w r12, r1, r1, asr #31 +; CHECK-NEXT: adds.w r3, r3, r2, asr #31 +; CHECK-NEXT: eor.w r0, r0, r1, asr #31 +; CHECK-NEXT: eor.w r3, r3, r2, asr #31 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: eor.w r0, r12, r1, asr #31 +; CHECK-NEXT: adc.w r1, r2, r2, asr #31 +; CHECK-NEXT: eor.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-ctlz.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-ctlz.ll +++ llvm/test/CodeGen/Thumb2/mve-ctlz.ll @@ -4,26 +4,24 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){ ; 
CHECK-LABEL: ctlz_2i64_0_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add.w r2, r2, #32 -; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s6, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add.w r2, r2, #32 -; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: clzne r0, r1 +; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s4, r2 +; CHECK-NEXT: clzne r0, r1 +; CHECK-NEXT: vmov s4, r0 ; CHECK-NEXT: vldr s5, .LCPI0_0 ; CHECK-NEXT: vmov.f32 s7, s5 ; CHECK-NEXT: vmov q0, q1 @@ -70,26 +68,24 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){ ; CHECK-LABEL: ctlz_2i64_1_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add.w r2, r2, #32 -; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s6, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: add.w r2, r2, #32 -; CHECK-NEXT: cset r1, ne +; CHECK-NEXT: clzne r0, r1 +; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r0, #32 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s4, r2 +; CHECK-NEXT: clzne r0, r1 +; CHECK-NEXT: vmov s4, r0 ; CHECK-NEXT: vldr s5, .LCPI4_0 ; CHECK-NEXT: vmov.f32 s7, s5 ; CHECK-NEXT: vmov q0, q1 Index: llvm/test/CodeGen/Thumb2/mve-ctpop.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-ctpop.ll +++ llvm/test/CodeGen/Thumb2/mve-ctpop.ll @@ -5,56 +5,54 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ ; CHECK-LABEL: ctpop_2i64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: mov.w r1, #1431655765 -; CHECK-NEXT: mov.w lr, #858993459 -; CHECK-NEXT: mov.w r4, #16843009 -; CHECK-NEXT: and.w r2, r1, r0, lsr #1 -; CHECK-NEXT: subs r0, r0, r2 -; CHECK-NEXT: and.w r3, lr, r0, lsr #2 -; CHECK-NEXT: bic r0, r0, #-858993460 -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: add.w r0, r0, r0, lsr #4 -; CHECK-NEXT: bic r12, r0, #-252645136 -; CHECK-NEXT: and.w r0, r1, r3, lsr #1 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: and.w r3, lr, r0, lsr #2 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: mov.w lr, #1431655765 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: mov.w r12, #858993459 +; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: and.w r0, lr, r2, lsr #1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: and.w r2, r12, r0, lsr #2 ; CHECK-NEXT: bic r0, r0, #-858993460 -; CHECK-NEXT: add r0, r3 -; 
CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: and.w r2, lr, r1, lsr #1 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: add.w r0, r0, r0, lsr #4 -; CHECK-NEXT: bic r0, r0, #-252645136 -; CHECK-NEXT: muls r0, r4, r0 -; CHECK-NEXT: lsrs r0, r0, #24 -; CHECK-NEXT: and.w r2, r1, r3, lsr #1 +; CHECK-NEXT: and.w r2, r12, r1, lsr #2 +; CHECK-NEXT: bic r1, r1, #-858993460 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: and.w r2, lr, r3, lsr #1 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: and.w r3, lr, r2, lsr #2 +; CHECK-NEXT: bic r5, r0, #-252645136 +; CHECK-NEXT: add.w r1, r1, r1, lsr #4 +; CHECK-NEXT: mov.w r0, #16843009 +; CHECK-NEXT: and.w r3, r12, r2, lsr #2 ; CHECK-NEXT: bic r2, r2, #-858993460 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: and.w r3, lr, r4, lsr #1 +; CHECK-NEXT: subs r3, r4, r3 +; CHECK-NEXT: bic r1, r1, #-252645136 ; CHECK-NEXT: add.w r2, r2, r2, lsr #4 +; CHECK-NEXT: muls r5, r0, r5 +; CHECK-NEXT: and.w r4, r12, r3, lsr #2 +; CHECK-NEXT: bic r3, r3, #-858993460 ; CHECK-NEXT: bic r2, r2, #-252645136 -; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: add r3, r4 +; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: add.w r3, r3, r3, lsr #4 +; CHECK-NEXT: muls r2, r0, r2 +; CHECK-NEXT: bic r3, r3, #-252645136 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: lsrs r1, r1, #24 +; CHECK-NEXT: add.w r1, r1, r5, lsr #24 ; CHECK-NEXT: lsrs r2, r2, #24 -; CHECK-NEXT: and.w r1, r1, r3, lsr #1 -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: and.w r3, lr, r1, lsr #2 -; CHECK-NEXT: bic r1, r1, #-858993460 -; CHECK-NEXT: add r1, r3 -; CHECK-NEXT: mul r3, r12, r4 -; CHECK-NEXT: add.w r1, r1, r1, lsr #4 -; CHECK-NEXT: bic r1, r1, #-252645136 -; CHECK-NEXT: muls r1, r4, r1 -; CHECK-NEXT: add.w r0, r0, r3, lsr #24 -; CHECK-NEXT: vmov s2, r0 -; CHECK-NEXT: add.w r0, r2, r1, lsr #24 +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: add.w r0, r2, r0, lsr #24 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vmov.f32 s3, s1 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI0_0: Index: llvm/test/CodeGen/Thumb2/mve-cttz.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-cttz.ll +++ llvm/test/CodeGen/Thumb2/mve-cttz.ll @@ -5,30 +5,28 @@ ; CHECK-LABEL: cttz_2i64_0_t: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: rbit r2, r2 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: adds r2, #32 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r1, #32 ; CHECK-NEXT: rbit r0, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s2, r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: rbit r2, r2 +; CHECK-NEXT: clzne r1, r0 +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: adds r2, #32 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r1, #32 ; CHECK-NEXT: rbit r0, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: clzne r1, r0 +; CHECK-NEXT: vmov s0, r1 ; CHECK-NEXT: vldr s1, .LCPI0_0 ; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx 
lr @@ -81,30 +79,28 @@ ; CHECK-LABEL: cttz_2i64_1_t: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: rbit r2, r2 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: adds r2, #32 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r1, #32 ; CHECK-NEXT: rbit r0, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s2, r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: rbit r2, r2 +; CHECK-NEXT: clzne r1, r0 +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: clz r2, r2 -; CHECK-NEXT: cset r1, ne -; CHECK-NEXT: adds r2, #32 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: cset r2, ne +; CHECK-NEXT: adds r1, #32 ; CHECK-NEXT: rbit r0, r0 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: clzne r2, r0 -; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: clzne r1, r0 +; CHECK-NEXT: vmov s0, r1 ; CHECK-NEXT: vldr s1, .LCPI4_0 ; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-div-expand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -5,22 +5,19 @@ define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-LABEL: udiv_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: udiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: udiv r1, r2, r1 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: udiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: udiv r1, r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, lr, d1 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: udiv r0, r2, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: udiv r1, r4, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: udiv r0, lr, r12 +; CHECK-NEXT: udiv r1, r5, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %out = udiv <4 x i32> %in1, %in2 ret <4 x i32> %out @@ -29,22 +26,19 @@ define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-LABEL: sdiv_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: sdiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sdiv r1, r2, r1 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: sdiv r0, r1, r0 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: sdiv r1, r2, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, lr, d1 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: sdiv r0, r2, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: sdiv r1, r4, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: sdiv r0, lr, r12 +; 
CHECK-NEXT: sdiv r1, r5, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %out = sdiv <4 x i32> %in1, %in2 ret <4 x i32> %out @@ -53,27 +47,23 @@ define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-LABEL: urem_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: udiv r2, r1, r0 -; CHECK-NEXT: mls r12, r2, r0, r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: udiv r3, r2, r1 -; CHECK-NEXT: mls lr, r3, r1, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: udiv r0, r3, r2 -; CHECK-NEXT: mls r0, r0, r2, r3 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: udiv r1, r3, r2 -; CHECK-NEXT: mls r1, r1, r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r1, lr, d2 +; CHECK-NEXT: udiv r4, r2, r0 +; CHECK-NEXT: mls r0, r4, r0, r2 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: udiv r5, r2, r1 +; CHECK-NEXT: mls r1, r5, r1, r2 +; CHECK-NEXT: udiv r2, r3, r12 +; CHECK-NEXT: mls r2, r2, r12, r3 +; CHECK-NEXT: udiv r3, r4, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: mls r3, r3, lr, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %out = urem <4 x i32> %in1, %in2 ret <4 x i32> %out @@ -82,27 +72,23 @@ define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) { ; CHECK-LABEL: srem_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: sdiv r2, r1, r0 -; CHECK-NEXT: mls r12, r2, r0, r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sdiv r3, r2, r1 -; CHECK-NEXT: mls lr, r3, r1, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: sdiv r0, r3, r2 -; CHECK-NEXT: mls r0, r0, r2, r3 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: sdiv r1, r3, r2 -; CHECK-NEXT: mls r1, r1, r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r1, lr, d2 +; CHECK-NEXT: sdiv r4, r2, r0 +; CHECK-NEXT: mls r0, r4, r0, r2 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: sdiv r5, r2, r1 +; CHECK-NEXT: mls r1, r5, r1, r2 +; CHECK-NEXT: sdiv r2, r3, r12 +; CHECK-NEXT: mls r2, r2, r12, r3 +; CHECK-NEXT: sdiv r3, r4, lr +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: mls r3, r3, lr, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %out = srem <4 x i32> %in1, %in2 ret <4 x i32> %out @@ -637,17 +623,13 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r0, r1, d10 +; 
CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 @@ -667,17 +649,13 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 @@ -697,17 +675,13 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: mov r4, r2 ; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r5 @@ -727,17 +701,13 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: mov r4, r2 ; CHECK-NEXT: mov r5, r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r5 @@ -774,24 +744,22 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r0, r4, d11 +; CHECK-NEXT: vmov r1, r5, d9 ; CHECK-NEXT: bl fmodf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl fmodf -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vmov r4, r2, d10 +; CHECK-NEXT: vmov r5, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov r5, s20 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r6 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 Index: llvm/test/CodeGen/Thumb2/mve-fmath.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -74,19 +74,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl cosf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov 
r0, r4 ; CHECK-NEXT: bl cosf -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -186,19 +185,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl sinf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl sinf -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -298,19 +296,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl expf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl expf -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -410,19 +407,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl exp2f -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl exp2f -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -522,19 +518,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl logf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl logf -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -634,19 +629,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl log2f -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl log2f -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, 
d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -746,19 +740,18 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r0, r4, d9 ; CHECK-NEXT: bl log10f -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl log10f -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r5 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -859,24 +852,22 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vmov q5, q0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r0, r4, d11 +; CHECK-NEXT: vmov r1, r5, d9 ; CHECK-NEXT: bl powf -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl powf -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vmov r4, r2, d10 +; CHECK-NEXT: vmov r5, r1, d8 ; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: vmov r5, s20 -; CHECK-NEXT: vmov s18, r4 +; CHECK-NEXT: vmov s18, r6 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s17, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov q0, q4 @@ -993,26 +984,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: lsrs r0, r0, #31 -; CHECK-NEXT: bfi r3, r0, #31, #1 -; CHECK-NEXT: lsr.w r0, lr, #31 -; CHECK-NEXT: bfi r2, r0, #31, #1 -; CHECK-NEXT: lsr.w r0, r12, #31 -; CHECK-NEXT: bfi r1, r0, #31, #1 -; CHECK-NEXT: vmov s3, r1 -; CHECK-NEXT: lsrs r0, r4, #31 -; CHECK-NEXT: vmov s2, r2 -; CHECK-NEXT: bfi r5, r0, #31, #1 -; CHECK-NEXT: vmov s1, r3 -; CHECK-NEXT: vmov s0, r5 +; CHECK-NEXT: vmov r12, r1, d2 +; CHECK-NEXT: vmov r2, lr, d3 +; CHECK-NEXT: vmov r3, r0, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: lsrs r1, r1, #31 +; CHECK-NEXT: bfi r0, r1, #31, #1 +; CHECK-NEXT: lsrs r1, r2, #31 +; CHECK-NEXT: bfi r4, r1, #31, #1 +; CHECK-NEXT: lsr.w r1, lr, #31 +; CHECK-NEXT: bfi r5, r1, #31, #1 +; CHECK-NEXT: lsr.w r1, r12, #31 +; CHECK-NEXT: bfi r3, r1, #31, #1 +; CHECK-NEXT: vmov s3, r5 +; CHECK-NEXT: vmov s2, r4 +; CHECK-NEXT: vmov s1, r0 +; CHECK-NEXT: vmov s0, r3 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %src1, <4 x float> %src2) Index: llvm/test/CodeGen/Thumb2/mve-gather-increment.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -41,31 +41,27 @@ ; CHECK-NEXT: vshl.i32 
q1, q1, #1 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh.w lr, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: vmov r1, lr, d2 +; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[5], lr +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r12 ; CHECK-NEXT: pop {r4, r5, r7, pc} %1 = add <8 x i32> %offs, %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1 @@ -111,61 +107,53 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i32 q4, #0x10 ; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vmov r1, r2, d7 +; CHECK-NEXT: vmov r3, r4, d6 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vadd.i32 q3, q0, q4 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vadd.i32 q2, q2, q4 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: vadd.i32 q3, q0, q4 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r4] +; CHECK-NEXT: ldrb r4, [r5] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov r5, r6, d6 +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[0], r5 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r5, [r6] ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r5, r6, d7 +; CHECK-NEXT: ldrb r0, [r5] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, r5, d2 ; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r0, r5, d3 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: 
vmov r0, s7 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r0, r5, d4 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.8 q0[8], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r2 -; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r1 -; CHECK-NEXT: vmov.8 q0[14], r12 -; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} %1 = add <16 x i32> %offs, @@ -564,77 +552,71 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: blt .LBB11_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: vmov.i16 q1, #0x8 -; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r6, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: bic r12, r1, #7 +; CHECK-NEXT: add r1, sp, #8 +; CHECK-NEXT: sub.w r3, r12, #8 +; CHECK-NEXT: add.w r8, r5, r3, lsr #3 +; CHECK-NEXT: adr r5, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 ; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q2, [r2] -; CHECK-NEXT: mov r10, r2 -; CHECK-NEXT: vldrh.s32 q3, [r2] -; CHECK-NEXT: vldrh.s32 q4, [r2, #8] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: vldrh.s32 q4, [r1, #8] +; CHECK-NEXT: vldrh.s32 q3, [r1] ; CHECK-NEXT: vadd.i16 q2, q2, q1 -; CHECK-NEXT: vshl.i32 q3, q3, #1 ; CHECK-NEXT: vshl.i32 q4, q4, #1 -; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vshl.i32 q3, q3, #1 ; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: vmov r9, s13 -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: ldrh.w r11, [r6] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: ldrh.w r5, [r9] +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r1, r2, d9 +; CHECK-NEXT: vmov r6, r7, d7 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: ldrh.w r11, [r2] +; CHECK-NEXT: vmov r2, r9, d6 +; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q3[0], r6 -; CHECK-NEXT: vmov.16 q3[1], r5 -; CHECK-NEXT: vmov.16 
q3[2], r7 -; CHECK-NEXT: vmov.16 q3[3], r11 +; CHECK-NEXT: ldrh.w r9, [r9] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r9 +; CHECK-NEXT: vmov.16 q3[2], r6 +; CHECK-NEXT: vmov.16 q3[3], r7 ; CHECK-NEXT: vmov.16 q3[4], r3 ; CHECK-NEXT: vmov.16 q3[5], r4 ; CHECK-NEXT: vmov.16 q3[6], r1 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: vstrb.8 q3, [r12], #16 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: vmov.16 q3[7], r11 +; CHECK-NEXT: vstrb.8 q3, [r5], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r12, r2 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -690,8 +672,8 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #168 -; CHECK-NEXT: sub sp, #168 +; CHECK-NEXT: .pad #152 +; CHECK-NEXT: sub sp, #152 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: str r1, [sp, #72] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 @@ -706,144 +688,131 @@ ; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: add r3, sp, #152 -; CHECK-NEXT: vmov.i16 q1, #0x18 +; CHECK-NEXT: vmov.i16 q2, #0x18 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3 ; CHECK-NEXT: str r1, [sp, #68] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: adr r2, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add r2, sp, #136 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload +; CHECK-NEXT: add.w r10, sp, #120 ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr.w r10, [sp, #72] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: ldr r7, [sp, #72] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q5, [r3] -; CHECK-NEXT: vldrh.s32 q0, [r3, #8] -; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: add r7, sp, #120 +; CHECK-NEXT: vstrw.32 q5, [r2] +; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: vldrh.s32 q0, [r2, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: ldrh r4, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh.w r11, [r1] -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vldrh.s32 q0, [r3] +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q2, 
q0, r0 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r9, s8 -; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: add r1, sp, #136 -; CHECK-NEXT: ldrh r5, [r3] -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: ldrh r6, [r3] -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vldrh.s32 q0, [r1, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r6, r2, d4 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w r12, [r4] +; CHECK-NEXT: add r4, sp, #104 +; CHECK-NEXT: ldrh.w r11, [r5] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vstrw.32 q6, [r7] -; CHECK-NEXT: vldrh.s32 q3, [r1] -; CHECK-NEXT: ldrh.w r1, [r9] -; CHECK-NEXT: vldrh.s32 q4, [r7] -; CHECK-NEXT: vldrh.s32 q0, [r7, #8] -; CHECK-NEXT: vmov.16 q7[0], r1 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vshl.i32 q4, q4, #1 +; CHECK-NEXT: ldrh r5, [r6] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vldrh.s32 q0, [r4] +; CHECK-NEXT: vmov.16 q7[0], r5 +; CHECK-NEXT: vmov.16 q7[1], r2 ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q4, q4, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q7[1], r1 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vmov.16 q7[2], r5 -; CHECK-NEXT: vmov.16 q7[3], r6 -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov.16 q7[6], r11 -; CHECK-NEXT: vmov.16 q7[7], r12 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov r1, s17 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: vadd.i16 q6, q6, q1 -; CHECK-NEXT: vadd.i16 q5, q5, q1 -; CHECK-NEXT: vadd.i16 q3, q3, q1 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vmov r6, r9, d0 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q1[0], r6 +; CHECK-NEXT: ldrh.w r6, [r9] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: vmov.16 
q1[3], r5 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, r5, d1 +; CHECK-NEXT: vmov.16 q1[5], r6 +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vstrw.32 q4, [r10] +; CHECK-NEXT: vldrh.s32 q0, [r6] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.16 q3[1], r5 +; CHECK-NEXT: vmov r2, r5, d5 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q6, q6, q2 +; CHECK-NEXT: vadd.i16 q5, q5, q2 +; CHECK-NEXT: vadd.i16 q4, q4, q2 +; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: vldrh.s32 q0, [r6, #8] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q7[2], r9 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.16 q7[3], r5 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov.16 q7[4], r1 +; CHECK-NEXT: vmov.16 q7[5], r3 +; CHECK-NEXT: vmov.16 q7[6], r12 +; CHECK-NEXT: vmov.16 q7[7], r11 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov r2, r4, d1 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: vmov.16 q3[7], r4 +; CHECK-NEXT: vadd.i16 q0, q3, q1 ; CHECK-NEXT: vadd.i16 q0, q0, q7 -; CHECK-NEXT: vstrb.8 q0, [r10], #16 +; CHECK-NEXT: vstrb.8 q0, [r7], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #76] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: ldr r3, [sp, #76] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #168 +; CHECK-NEXT: add sp, #152 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -929,27 +898,23 @@ ; CHECK-NEXT: .pad #328 ; CHECK-NEXT: sub sp, #328 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #120] @ 4-byte Spill -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #124] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #124] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload -; CHECK-NEXT: adr.w r6, .LCPI13_8 -; CHECK-NEXT: adr.w r7, .LCPI13_7 -; CHECK-NEXT: adr.w r3, .LCPI13_6 -; CHECK-NEXT: bic r11, r1, #7 ; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: adr r6, .LCPI13_8 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI13_1 -; CHECK-NEXT: vmov.i32 q5, #0x30 -; CHECK-NEXT: str.w r11, [sp, #116] @ 4-byte Spill +; CHECK-NEXT: adr r7, .LCPI13_7 +; CHECK-NEXT: adr r3, .LCPI13_6 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: bic r10, r2, #7 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr.w r6, .LCPI13_9 +; CHECK-NEXT: adr r6, .LCPI13_9 +; CHECK-NEXT: vmov.i32 q2, #0x30 ; CHECK-NEXT: 
vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill @@ -962,243 +927,222 @@ ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: adr r1, .LCPI13_3 -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI13_4 -; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q5, [r1] ; CHECK-NEXT: adr r1, .LCPI13_2 -; CHECK-NEXT: vstrw.32 q2, [sp, #224] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI13_10 -; CHECK-NEXT: vstrw.32 q2, [sp, #272] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #304] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q6, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI13_11 -; CHECK-NEXT: ldr.w r9, [sp, #120] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q2, [sp, #208] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [r1] -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: ldr.w r8, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q6, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [r1] +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q3, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: mov r11, r10 +; CHECK-NEXT: vstrw.32 q6, [sp, #240] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #192] @ 16-byte Spill ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q3, [sp, #240] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q3, q6, r0 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q7, r0 -; CHECK-NEXT: vstrw.32 q6, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q1, r0 ; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vldrw.u32 q2, [sp, #240] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q6, q6, r0 -; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov r1, lr, d8 +; CHECK-NEXT: vadd.i32 q7, q7, r0 +; CHECK-NEXT: vmov r5, r4, d15 +; CHECK-NEXT: vadd.i32 q6, q0, r0 +; CHECK-NEXT: vmov r6, r7, d13 +; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: 
vstrw.32 q0, [sp, #176] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vstrw.32 q1, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vldrw.u32 q3, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q5, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb.w r9, [r1] +; CHECK-NEXT: vmov r1, r3, d14 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r6, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: ldrb.w r10, [r1] -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: ldrb.w r8, [r1] -; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[0], r1 -; CHECK-NEXT: vmov r1, s25 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[1], r1 -; CHECK-NEXT: vmov r1, s26 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[2], r1 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.8 q7[4], r6 +; CHECK-NEXT: vmov r1, r3, d12 +; CHECK-NEXT: vmov.8 q7[2], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: ldrb r6, [r4] +; CHECK-NEXT: vmov.8 q7[3], r6 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[1], r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vadd.i32 q3, q4, r0 -; CHECK-NEXT: vldrw.u32 q4, [sp, #224] @ 16-byte Reload -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: vmov r6, r1, d2 +; CHECK-NEXT: vmov.8 q6[1], r3 +; CHECK-NEXT: vmov.8 q6[2], r5 +; CHECK-NEXT: vmov.8 q6[3], r7 +; CHECK-NEXT: ldrb.w r7, [lr] +; CHECK-NEXT: vmov.8 q6[4], r9 +; CHECK-NEXT: vmov.8 q6[5], r7 +; CHECK-NEXT: ldrb r4, [r1] +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q0, q1, r0 +; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, r3, d9 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vldrw.u32 q4, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[2], r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.8 q6[3], r12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: vldrw.u32 q1, [sp, #304] @ 16-byte Reload ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vstrw.32 q1, [sp, #304] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov.8 q6[6], r1 +; CHECK-NEXT: vmov r1, r7, d0 +; CHECK-NEXT: vmov.8 q6[7], r3 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q7[4], r1 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[5], r7 +; CHECK-NEXT: vstrw.32 q0, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[4], r1 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: vmov.8 q6[5], lr -; CHECK-NEXT: vmov.8 q6[6], r8 -; CHECK-NEXT: vmov.8 q6[7], r5 -; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: ldrb r1, [r6] +; CHECK-NEXT: vmov r7, r6, d0 +; CHECK-NEXT: vmov.8 q7[7], r3 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[8], 
r1 ; CHECK-NEXT: vstrw.32 q0, [sp, #288] @ 16-byte Spill ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: ldrb r7, [r1] -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp, #208] @ 16-byte Reload -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q7[5], r5 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov.8 q7[6], r10 -; CHECK-NEXT: vmov.8 q7[7], r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.8 q7[8], r2 +; CHECK-NEXT: vmov.8 q7[9], r4 +; CHECK-NEXT: vmov r4, r1, d0 +; CHECK-NEXT: vmov.8 q7[10], r12 +; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q7[9], r7 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[10], r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.8 q7[11], r3 -; CHECK-NEXT: vmov.8 q7[12], r6 -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q7[13], r5 -; CHECK-NEXT: vmov.8 q7[14], r4 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r1 -; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[8], r4 +; CHECK-NEXT: vmov r5, r4, d1 ; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[10], r1 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vadd.i32 q1, q2, r0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[11], r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[12], r1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[13], r1 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q5, r0 +; CHECK-NEXT: vldrw.u32 q5, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[10], r5 +; CHECK-NEXT: vmov.8 q6[11], r4 +; CHECK-NEXT: vmov.8 q6[12], r7 +; CHECK-NEXT: vmov.8 q6[13], r6 +; CHECK-NEXT: vmov.8 q6[14], r3 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[14], r1 -; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: vmov.8 q7[13], r1 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vstrw.32 q1, [sp, #240] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q5 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vstrw.32 q4, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q2 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i8 q6, q7, q6 -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb.w r1, [lr] +; CHECK-NEXT: vmov.8 q6[15], r1 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vadd.i8 q6, q6, q7 ; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.8 q7[1], r2 +; CHECK-NEXT: vmov.8 q7[1], r3 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 
q3, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #272] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #272] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #272] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vldrw.u32 q4, [sp, #272] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: vstrw.32 q4, [sp, #272] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [sp, #304] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: vstrw.32 q4, [sp, #304] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vstrw.32 q5, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q5, q5, q2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q5 -; CHECK-NEXT: vstrw.32 q3, [sp, #208] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #240] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q5 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vadd.i32 q0, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vstrw.32 q4, [sp, #192] @ 16-byte Spill ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vadd.i32 q0, q2, r0 -; CHECK-NEXT: vadd.i32 q2, q2, q5 -; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #288] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, q5 -; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, r3, d1 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 ; CHECK-NEXT: vadd.i8 q0, q6, q7 ; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #160] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; 
CHECK-NEXT: vstrb.8 q0, [r8], #16 ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q5 -; CHECK-NEXT: vadd.i32 q6, q6, q5 -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vadd.i32 q7, q7, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: bne.w .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: cmp r11, r1 +; CHECK-NEXT: cmp r10, r2 ; CHECK-NEXT: bne.w .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #328 @@ -1319,6 +1263,7 @@ ; CHECK-NEXT: .pad #72 ; CHECK-NEXT: sub sp, #72 ; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, #68] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: adr r5, .LCPI14_3 @@ -1326,7 +1271,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: adr r6, .LCPI14_2 ; CHECK-NEXT: adr r3, .LCPI14_0 -; CHECK-NEXT: bic r12, r2, #7 +; CHECK-NEXT: bic r1, r2, #7 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: vmov.i32 q4, #0x10 @@ -1338,77 +1283,69 @@ ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 +; CHECK-NEXT: ldr.w lr, [sp, #68] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: mov lr, r1 -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: .LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q7, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vadd.i32 q3, q5, r0 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: subs r3, #16 -; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vadd.i32 q2, q6, r0 +; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: subs.w r8, r8, #16 +; CHECK-NEXT: vmov r3, r9, d4 +; CHECK-NEXT: vadd.i32 q2, q7, r0 ; CHECK-NEXT: vadd.i32 q5, q5, q4 +; CHECK-NEXT: vadd.i32 q6, q6, q4 ; CHECK-NEXT: vadd.i32 q7, q7, q4 ; CHECK-NEXT: vadd.i32 q0, q0, q4 -; CHECK-NEXT: ldrb.w r8, [r4] -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w r10, [r4] -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: ldrb.w r9, [r4] -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: ldrb.w r11, [r4] -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: vmov.8 q2[0], r6 -; CHECK-NEXT: vmov r6, s13 +; CHECK-NEXT: ldrb.w r11, [r6] +; CHECK-NEXT: ldrb.w r10, [r7] +; CHECK-NEXT: vmov r6, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q2[1], r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q2[2], r6 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: vadd.i32 q3, q6, r0 -; CHECK-NEXT: vadd.i32 q6, q6, q4 -; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w r9, [r9] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q2[3], r6 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: vmov.8 q2[4], r7 -; CHECK-NEXT: vmov r7, s13 +; CHECK-NEXT: vmov.8 q1[0], r6 +; CHECK-NEXT: vmov.8 q1[1], r7 +; CHECK-NEXT: vmov r6, r7, d5 +; 
CHECK-NEXT: vmov.8 q1[2], r4 +; CHECK-NEXT: vmov.8 q1[3], r5 +; CHECK-NEXT: vmov.8 q1[4], r3 +; CHECK-NEXT: vmov.8 q1[5], r9 +; CHECK-NEXT: vmov.8 q1[6], r11 +; CHECK-NEXT: vmov.8 q1[7], r10 +; CHECK-NEXT: ldrb.w r12, [r7] +; CHECK-NEXT: vmov r5, r7, d7 ; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb.w r9, [r7] +; CHECK-NEXT: vmov r7, r3, d6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r11, [r3] +; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q2[5], r7 -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q2[6], r7 -; CHECK-NEXT: vmov r7, s15 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q2[7], r7 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q2[8], r7 -; CHECK-NEXT: vmov.8 q2[9], r6 -; CHECK-NEXT: vmov.8 q2[10], r8 -; CHECK-NEXT: vmov.8 q2[11], r10 -; CHECK-NEXT: vmov.8 q2[12], r5 -; CHECK-NEXT: vmov.8 q2[13], r9 -; CHECK-NEXT: vmov.8 q2[14], r11 -; CHECK-NEXT: vmov.8 q2[15], r4 -; CHECK-NEXT: vstrb.8 q2, [lr], #16 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q1[8], r3 +; CHECK-NEXT: vmov.8 q1[9], r4 +; CHECK-NEXT: vmov.8 q1[10], r6 +; CHECK-NEXT: vmov.8 q1[11], r12 +; CHECK-NEXT: vmov.8 q1[12], r7 +; CHECK-NEXT: vmov.8 q1[13], r11 +; CHECK-NEXT: vmov.8 q1[14], r5 +; CHECK-NEXT: vmov.8 q1[15], r9 +; CHECK-NEXT: vstrb.8 q1, [lr], #16 ; CHECK-NEXT: bne .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 -; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #72 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -49,36 +49,32 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.s32 q0, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; 
CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -94,29 +90,25 @@ ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vldr.16 s8, [r3] ; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldr.16 s4, [r3] +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldr.16 s4, [r3] ; CHECK-NEXT: vldr.16 s1, [r2] ; CHECK-NEXT: vins.f16 s1, s4 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vldr.16 s8, [r1] -; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -259,39 +251,35 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vmov.i32 q1, #0x28 -; CHECK-NEXT: vldrh.s32 q2, [r1, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmov.i32 q0, #0x28 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vmov r2, r12, d2 +; CHECK-NEXT: vmov r3, lr, d3 +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -378,51 +366,47 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: 
push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: adr r1, .LCPI17_0 -; CHECK-NEXT: adr.w r12, .LCPI17_1 +; CHECK-NEXT: adr r2, .LCPI17_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, lr, d0 +; CHECK-NEXT: vmov r3, r12, d1 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], lr +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI17_0: -; CHECK-NEXT: .long 131072 @ 0x20000 -; CHECK-NEXT: .long 131078 @ 0x20006 -; CHECK-NEXT: .long 131084 @ 0x2000c -; CHECK-NEXT: .long 131090 @ 0x20012 -; CHECK-NEXT: .LCPI17_1: ; CHECK-NEXT: .long 131096 @ 0x20018 ; CHECK-NEXT: .long 131102 @ 0x2001e ; CHECK-NEXT: .long 131108 @ 0x20024 ; CHECK-NEXT: .long 131114 @ 0x2002a +; CHECK-NEXT: .LCPI17_1: +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 131078 @ 0x20006 +; CHECK-NEXT: .long 131084 @ 0x2000c +; CHECK-NEXT: .long 131090 @ 0x20012 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536 @@ -433,51 +417,47 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: adr r1, .LCPI18_0 -; CHECK-NEXT: adr.w r12, .LCPI18_1 +; CHECK-NEXT: adr r2, .LCPI18_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, lr, d0 +; CHECK-NEXT: vmov r3, r12, d1 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: 
ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], lr +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI18_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .LCPI18_1: ; CHECK-NEXT: .long 24 @ 0x18 ; CHECK-NEXT: .long 131072 @ 0x20000 ; CHECK-NEXT: .long 36 @ 0x24 ; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 18 @ 0x12 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) @@ -487,36 +467,32 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep5: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov.i32 q2, #0x20000 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r1, lr, d1 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: ldrh.w r3, [lr] +; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r6 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536 %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) @@ -526,51 +502,47 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep6: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save 
{r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: adr r1, .LCPI20_0 -; CHECK-NEXT: adr.w r12, .LCPI20_1 +; CHECK-NEXT: adr r2, .LCPI20_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, lr, d0 +; CHECK-NEXT: vmov r3, r12, d1 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], lr +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI20_0: -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .LCPI20_1: ; CHECK-NEXT: .long 131074 @ 0x20002 ; CHECK-NEXT: .long 32 @ 0x20 ; CHECK-NEXT: .long 38 @ 0x26 ; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .LCPI20_1: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 20 @ 0x14 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1 @@ -581,51 +553,47 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_biggep7: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: adr r1, .LCPI21_0 -; CHECK-NEXT: adr.w r12, .LCPI21_1 +; CHECK-NEXT: adr r2, .LCPI21_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r1, lr, d0 +; CHECK-NEXT: vmov r3, r12, d1 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r2, d1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w lr, [lr] ; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; 
CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], lr +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI21_0: -; CHECK-NEXT: .long 128 @ 0x80 -; CHECK-NEXT: .long 1206 @ 0x4b6 -; CHECK-NEXT: .long 1212 @ 0x4bc -; CHECK-NEXT: .long 1218 @ 0x4c2 -; CHECK-NEXT: .LCPI21_1: ; CHECK-NEXT: .long 1224 @ 0x4c8 ; CHECK-NEXT: .long 1230 @ 0x4ce ; CHECK-NEXT: .long 1236 @ 0x4d4 ; CHECK-NEXT: .long 1242 @ 0x4da +; CHECK-NEXT: .LCPI21_1: +; CHECK-NEXT: .long 128 @ 0x80 +; CHECK-NEXT: .long 1206 @ 0x4b6 +; CHECK-NEXT: .long 1212 @ 0x4bc +; CHECK-NEXT: .long 1218 @ 0x4c2 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600 @@ -638,36 +606,32 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vshl.i32 q1, q1, #2 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll @@ -19,38 +19,34 @@ define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: zext_unscaled_i8_i16_noext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, 
r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, lr, d1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vldrb.s32 q1, [r1, #4] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: ldrb r6, [r2] +; CHECK-NEXT: ldrb.w r2, [r12] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w lr, [lr] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: vmov.16 q0[1], r5 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.16 q0[1], lr -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 2 %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs @@ -64,36 +60,32 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #4] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vldrb.s32 q1, [r1, #4] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 2 @@ -108,36 +100,32 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push 
{r4, r5, r7, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #4] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh.w r12, [r12] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh.w lr, [lr] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 2 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -461,16 +461,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q1, #0x10 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r1, [r1] ; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: ldr r1, [r1] ; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: bx lr entry: %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -20,34 +20,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: vmov r3, lr, d1 ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb 
r1, [r1] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb.w r12, [r12] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w lr, [lr] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrb r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r12 ; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[7], lr ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -82,65 +78,57 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrb.s32 q0, [r1, #8] -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrb.s32 q0, [r1, #12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; 
CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.sext = sext <16 x i8> %offs to <16 x i32> @@ -152,65 +140,57 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrh.s32 q0, [r1, #16] -; CHECK-NEXT: vldrh.s32 q2, [r1, #8] -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrh.s32 q0, [r1, #24] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrh.s32 q0, [r1, #16] +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; 
CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i16>, <16 x i16>* %offptr, align 2 %offs.sext = sext <16 x i16> %offs to <16 x i32> @@ -222,69 +202,61 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_scaled: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrb.u32 q0, [r1, #8] -; CHECK-NEXT: vldrb.u32 q2, [r1, #4] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vshl.i32 q2, q2, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrb.u32 q0, [r1, #12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vmov r6, s4 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: vshl.i32 q2, q2, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: 
vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 4 %offs.zext = zext <16 x i8> %offs to <16 x i32> @@ -297,65 +269,57 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8_next: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q0, [r1, #32] -; CHECK-NEXT: vldrw.u32 q2, [r1, #16] -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r1, #48] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #32] +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrw.u32 q2, [r1, #16] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; 
CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i32>, <16 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs @@ -372,59 +336,51 @@ ; CHECK-NEXT: vmov.i32 q2, #0x5 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r3, [r2] -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrb.s32 q0, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q0, q2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vmov r6, s4 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r4, s6 ; CHECK-NEXT: vadd.i32 q3, q0, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r7, s15 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q0, q2 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q0[0], r5 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d7 ; CHECK-NEXT: vldrb.s32 q3, [r1, #4] ; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r4 -; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 ; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 ; CHECK-NEXT: vmov.8 q0[14], r12 @@ -543,81 +499,73 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: adr r1, .LCPI11_0 -; CHECK-NEXT: 
adr r2, .LCPI11_1 +; CHECK-NEXT: adr r4, .LCPI11_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI11_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adr r7, .LCPI11_3 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI11_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: adr r1, .LCPI11_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov r5, r6, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r0, [r5] +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r0, r5, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r0, r5, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: -; CHECK-NEXT: .long 280 @ 0x118 -; CHECK-NEXT: .long 283 @ 0x11b -; CHECK-NEXT: .long 286 @ 0x11e -; CHECK-NEXT: .long 289 @ 0x121 -; CHECK-NEXT: .LCPI11_1: ; CHECK-NEXT: .long 292 @ 0x124 ; CHECK-NEXT: .long 295 @ 0x127 ; CHECK-NEXT: .long 298 @ 0x12a ; CHECK-NEXT: .long 301 @ 0x12d +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 280 
@ 0x118 +; CHECK-NEXT: .long 283 @ 0x11b +; CHECK-NEXT: .long 286 @ 0x11e +; CHECK-NEXT: .long 289 @ 0x121 ; CHECK-NEXT: .LCPI11_2: ; CHECK-NEXT: .long 256 @ 0x100 ; CHECK-NEXT: .long 259 @ 0x103 @@ -639,81 +587,73 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: adr r1, .LCPI12_0 -; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: adr r4, .LCPI12_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI12_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adr r7, .LCPI12_3 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI12_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: adr r1, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov r5, r6, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r0, [r5] +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r0, r5, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r0, r5, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: 
vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: -; CHECK-NEXT: .long 256 @ 0x100 -; CHECK-NEXT: .long 27 @ 0x1b -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI12_1: ; CHECK-NEXT: .long 36 @ 0x24 ; CHECK-NEXT: .long 39 @ 0x27 ; CHECK-NEXT: .long 42 @ 0x2a ; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 ; CHECK-NEXT: .LCPI12_2: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 @@ -734,65 +674,57 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep5: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i32 q4, #0x100 -; CHECK-NEXT: vadd.i32 q2, q2, q4 ; CHECK-NEXT: vadd.i32 q3, q3, q4 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vmov r3, r2, d7 ; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: ldrb.w r12, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r3, [r0] -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vadd.i32 q3, q0, q4 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[0], r5 -; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vmov r2, r5, d6 +; CHECK-NEXT: ldrb r4, [r0] +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r2 ; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: vmov r2, r5, d7 +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r2 ; CHECK-NEXT: vmov.8 q0[3], r5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[4], r5 -; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov r2, r5, d2 +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r2 ; CHECK-NEXT: vmov.8 q0[5], r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[6], r5 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r2, r5, d3 +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[6], r2 ; CHECK-NEXT: vmov.8 q0[7], r5 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: vmov r2, r5, d4 +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r2 ; CHECK-NEXT: vmov.8 q0[9], r5 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], lr -; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[10], r1 +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.8 q0[12], r4 ; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: 
vmov.8 q0[15], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) @@ -803,81 +735,73 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep6: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: adr r1, .LCPI14_0 -; CHECK-NEXT: adr r2, .LCPI14_1 +; CHECK-NEXT: adr r4, .LCPI14_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI14_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adr r7, .LCPI14_3 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI14_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: adr r1, .LCPI14_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov r5, r6, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r0, [r5] +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r0, r5, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r0, r5, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; 
CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI14_0: -; CHECK-NEXT: .long 257 @ 0x101 -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 31 @ 0x1f -; CHECK-NEXT: .long 34 @ 0x22 -; CHECK-NEXT: .LCPI14_1: ; CHECK-NEXT: .long 37 @ 0x25 ; CHECK-NEXT: .long 40 @ 0x28 ; CHECK-NEXT: .long 43 @ 0x2b ; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .long 257 @ 0x101 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 31 @ 0x1f +; CHECK-NEXT: .long 34 @ 0x22 ; CHECK-NEXT: .LCPI14_2: ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 @@ -899,81 +823,73 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep7: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: adr r1, .LCPI15_0 -; CHECK-NEXT: adr r2, .LCPI15_1 +; CHECK-NEXT: adr r4, .LCPI15_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI15_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adr r7, .LCPI15_3 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vmov r4, r6, d3 ; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI15_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: adr r1, .LCPI15_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb r1, [r5] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: vmov r5, r6, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: ldrb r5, [r6] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r0, [r5] +; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r5 +; CHECK-NEXT: vmov r0, r5, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] ; 
CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r0, r5, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI15_0: -; CHECK-NEXT: .long 224 @ 0xe0 -; CHECK-NEXT: .long 227 @ 0xe3 -; CHECK-NEXT: .long 230 @ 0xe6 -; CHECK-NEXT: .long 233 @ 0xe9 -; CHECK-NEXT: .LCPI15_1: ; CHECK-NEXT: .long 236 @ 0xec ; CHECK-NEXT: .long 239 @ 0xef ; CHECK-NEXT: .long 242 @ 0xf2 ; CHECK-NEXT: .long 245 @ 0xf5 +; CHECK-NEXT: .LCPI15_1: +; CHECK-NEXT: .long 224 @ 0xe0 +; CHECK-NEXT: .long 227 @ 0xe3 +; CHECK-NEXT: .long 230 @ 0xe6 +; CHECK-NEXT: .long 233 @ 0xe9 ; CHECK-NEXT: .LCPI15_2: ; CHECK-NEXT: .long 300 @ 0x12c ; CHECK-NEXT: .long 203 @ 0xcb @@ -995,65 +911,57 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8_2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrb.s32 q0, [r1, #8] -; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrb.s32 q0, [r1, #12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 
; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs @@ -1097,69 +1005,61 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_basei16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrb.u32 q0, [r1, #8] -; CHECK-NEXT: vldrb.u32 q2, [r1, #4] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrb.u32 q0, [r1, #12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov r6, s4 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: ldrb r3, [r4] +; CHECK-NEXT: ldrb r2, [r5] +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r4, r6, d3 +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: ldrb r5, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: vmov r6, r7, d5 +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: ldrb r0, [r6] +; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 
q0[3], r0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov.8 q0[3], r7 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.8 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: vmov.8 q0[7], r1 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r1 +; CHECK-NEXT: vmov.8 q0[10], r5 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 ; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.zext = zext <16 x i8> %offs to <16 x i32> Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -32,31 +32,27 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) { ; CHECK-LABEL: ptr_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: ldr.w r12, [r1] -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldr.w lr, [r2] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: vmov r0, lr, d1 +; CHECK-NEXT: ldr r7, [r2] +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: ldr r6, [r1] ; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr.w r1, [r12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 +; CHECK-NEXT: ldr.w r5, [lr] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r7 ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: ldr r4, [r4] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4 %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %offs, i32 4, <8 x i1> , <8 x i32> undef) @@ -68,50 +64,42 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; 
CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: ldr r7, [r2] +; CHECK-NEXT: vmov r2, r6, d0 ; CHECK-NEXT: ldr.w r12, [r1] -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldr r3, [r3] +; CHECK-NEXT: ldr r4, [r4] ; CHECK-NEXT: ldr r5, [r5] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r12 +; CHECK-NEXT: ldr.w r1, [lr] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r7 +; CHECK-NEXT: ldr r2, [r2] ; CHECK-NEXT: ldr r6, [r6] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 +; CHECK-NEXT: vmov r6, r5, d2 ; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r7, [r7] -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: ldr.w lr, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldr r3, [r1] -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: ldr r1, [r1] ; CHECK-NEXT: ldr r6, [r6] ; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r7, r6 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q1[2], q1[0], r6, r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r6, s13 -; CHECK-NEXT: vmov q3[2], q3[0], r3, lr -; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 +; CHECK-NEXT: ldr r6, [r4] +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r6 +; CHECK-NEXT: vmov r6, r5, d4 ; CHECK-NEXT: ldr r0, [r0] ; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r12 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r5 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: ldr r5, [r5] +; CHECK-NEXT: vmov q2[2], q2[0], r6, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r2 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4 @@ -149,25 +137,23 @@ define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) { ; CHECK-LABEL: ptr_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vldr s3, [r1] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vldr s2, [r1] -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: vmov lr, r1, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vldr s3, [r2] +; CHECK-NEXT: vldr s2, [r12] ; CHECK-NEXT: vldr s1, [r1] -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vldr s7, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vldr s0, [r1] +; CHECK-NEXT: vldr s7, [r3] ; CHECK-NEXT: vldr s6, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vldr s5, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vldr s4, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldr s5, [r5] +; CHECK-NEXT: vldr s0, [lr] +; CHECK-NEXT: vldr s4, [r4] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x float*>, <8 x float*>* %offptr, align 4 %gather = call <8 x float> 
@llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %offs, i32 4, <8 x i1> , <8 x float> undef) @@ -179,35 +165,31 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_i16(<8 x i16*>* %offptr) { ; CHECK-LABEL: ptr_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vmov r3, r12, d1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh.w lr, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, lr, d1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r6, [r3] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[0], r4 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: ldrh.w r3, [lr] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: ldrh.w r12, [r12] +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q0[6], r6 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> , <8 x i16> undef) @@ -253,10 +235,8 @@ ; CHECK-LABEL: ptr_v4i16_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r1, [r1] @@ -276,10 +256,8 @@ ; CHECK-LABEL: ptr_v4i16_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrh r0, [r0] ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r1, [r1] @@ -298,42 +276,38 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) { ; CHECK-LABEL: ptr_v8i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: ldrh r2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r1, [r0] -; CHECK-NEXT: 
vmov r0, s6 +; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrh r7, [r1] +; CHECK-NEXT: ldrh.w r1, [r12] +; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: ldrh r5, [r0] -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, r5, d0 +; CHECK-NEXT: ldrh.w r6, [lr] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r5, [r5] ; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov.16 q0[1], lr -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r4 -; CHECK-NEXT: vmov.16 q0[7], r5 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: vmov.16 q0[3], r4 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r7 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r0, #8] ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> , <8 x i16> undef) @@ -344,33 +318,29 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) { ; CHECK-LABEL: ptr_v8i16_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: ldrh.w r12, [r1] -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldrh.w lr, [r2] -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, lr, d1 +; CHECK-NEXT: ldrh r7, [r2] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh.w r2, [r12] +; CHECK-NEXT: ldrh r4, [r4] ; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh.w r6, [lr] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r7 ; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> , <8 x i16> undef) @@ -384,27 +354,23 @@ ; CHECK-LABEL: ptr_f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vldr.16 s8, [r1] -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vldr.16 s8, [r2] ; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: vmov r1, r2, d3 ; CHECK-NEXT: vins.f16 s0, s8 +; 
CHECK-NEXT: vldr.16 s4, [r2] ; CHECK-NEXT: vldr.16 s1, [r1] ; CHECK-NEXT: vins.f16 s1, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vldr.16 s8, [r1] -; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vldr.16 s2, [r0] +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %offs = load <8 x half*>, <8 x half*>* %offptr, align 4 @@ -417,61 +383,53 @@ define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) { ; CHECK-LABEL: ptr_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.8 q0[0], r5 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[1], r5 -; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: vmov r6, r7, d4 +; CHECK-NEXT: vmov r4, r3, d1 +; CHECK-NEXT: ldrb r5, [r1] +; CHECK-NEXT: ldrb r1, [r2] +; CHECK-NEXT: ldrb r2, [r6] +; CHECK-NEXT: ldrb.w r12, [r3] +; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: ldrb.w lr, [r4] +; CHECK-NEXT: ldrb r4, [r2] +; CHECK-NEXT: ldrb r2, [r3] +; CHECK-NEXT: ldrb r3, [r7] +; CHECK-NEXT: vmov.8 q0[1], r3 +; CHECK-NEXT: vmov r3, r6, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q0[2], r3 +; CHECK-NEXT: vmov r0, r3, d4 ; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.8 q0[5], r3 +; CHECK-NEXT: vmov r0, r3, d5 ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: vmov.8 q0[7], r3 +; CHECK-NEXT: vmov r0, r3, d2 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.8 q0[9], r3 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r2 +; 
CHECK-NEXT: vmov.8 q0[12], r5 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], lr +; CHECK-NEXT: vmov.8 q0[15], r12 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4 %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %offs, i32 2, <16 x i1> , <16 x i8> undef) @@ -481,36 +439,32 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(<8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_sext16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: ldrb.w r1, [r12] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: vmov.16 q0[1], r5 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], lr -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q0[4], r1 ; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r7 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> , <8 x i8> undef) @@ -521,36 +475,32 @@ define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(<8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_zext16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r3, r1, d1 +; CHECK-NEXT: vmov r12, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov lr, r0, d1 +; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: ldrb.w r1, [r12] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: vmov.16 q0[1], r5 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], lr -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], r3 -; 
CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r6 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q0[4], r1 ; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r7 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> , <8 x i8> undef) @@ -562,16 +512,14 @@ ; CHECK-LABEL: ptr_v4i8_sext32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: bx lr @@ -586,18 +534,16 @@ ; CHECK-LABEL: ptr_v4i8_zext32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov.i32 q0, #0xff ; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4 @@ -609,35 +555,31 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_sext32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r3, r12, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov r0, lr, d1 +; CHECK-NEXT: ldrb r7, [r2] +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: ldrb r6, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb.w r1, [r12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 +; CHECK-NEXT: ldrb.w r5, [lr] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r7 +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: ldrb r2, [r2] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: 
vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r2 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> , <8 x i8> undef) @@ -648,34 +590,30 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_zext32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: ldrb.w lr, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vmov.i32 q1, #0xff -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r0, lr, d1 +; CHECK-NEXT: ldrb r7, [r2] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb.w r2, [r12] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q2[2], q2[0], r3, lr -; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r7 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r6 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> , <8 x i8> undef) @@ -792,37 +730,33 @@ define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: ldrb r6, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb.w r12, [r12] +; CHECK-NEXT: ldrb.w r2, [lr] +; 
CHECK-NEXT: vmov q1[2], q1[0], r3, r6 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r12 ; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -7,16 +7,14 @@ ; NOGATSCAT: @ %bb.0: @ %entry ; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] ; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 -; NOGATSCAT-NEXT: vmov r0, s2 -; NOGATSCAT-NEXT: vmov r1, s0 -; NOGATSCAT-NEXT: vmov r2, s3 -; NOGATSCAT-NEXT: vmov r3, s1 +; NOGATSCAT-NEXT: vmov r0, r1, d1 +; NOGATSCAT-NEXT: vmov r2, r3, d0 ; NOGATSCAT-NEXT: ldr r0, [r0] -; NOGATSCAT-NEXT: ldr r1, [r1] ; NOGATSCAT-NEXT: ldr r2, [r2] +; NOGATSCAT-NEXT: ldr r1, [r1] ; NOGATSCAT-NEXT: ldr r3, [r3] -; NOGATSCAT-NEXT: vmov q0[2], q0[0], r1, r0 -; NOGATSCAT-NEXT: vmov q0[3], q0[1], r3, r2 +; NOGATSCAT-NEXT: vmov q0[2], q0[0], r2, r0 +; NOGATSCAT-NEXT: vmov q0[3], q0[1], r3, r1 ; NOGATSCAT-NEXT: bx lr ; ; NOMVE-LABEL: unscaled_i32_i32_gather: @@ -46,21 +44,19 @@ define arm_aapcs_vfpcc void @unscaled_i32_i8_scatter(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { ; NOGATSCAT-LABEL: unscaled_i32_i8_scatter: ; NOGATSCAT: @ %bb.0: @ %entry +; NOGATSCAT-NEXT: .save {r4, r5, r7, lr} +; NOGATSCAT-NEXT: push {r4, r5, r7, lr} ; NOGATSCAT-NEXT: vldrb.u32 q1, [r1] -; NOGATSCAT-NEXT: vmov r1, s0 +; NOGATSCAT-NEXT: vmov r1, r3, d0 +; NOGATSCAT-NEXT: vmov r4, r5, d1 ; NOGATSCAT-NEXT: vadd.i32 q1, q1, r0 -; NOGATSCAT-NEXT: vmov r0, s4 -; NOGATSCAT-NEXT: str r1, [r0] -; NOGATSCAT-NEXT: vmov r0, s5 -; NOGATSCAT-NEXT: vmov r1, s1 +; NOGATSCAT-NEXT: vmov r0, r12, d2 +; NOGATSCAT-NEXT: vmov r2, lr, d3 ; NOGATSCAT-NEXT: str r1, [r0] -; NOGATSCAT-NEXT: vmov r0, s6 -; NOGATSCAT-NEXT: vmov r1, s2 -; NOGATSCAT-NEXT: str r1, [r0] -; NOGATSCAT-NEXT: vmov r0, s7 -; NOGATSCAT-NEXT: vmov r1, s3 -; NOGATSCAT-NEXT: str r1, [r0] -; NOGATSCAT-NEXT: bx lr +; NOGATSCAT-NEXT: str.w r3, [r12] +; NOGATSCAT-NEXT: str r4, [r2] +; NOGATSCAT-NEXT: str.w r5, [lr] +; NOGATSCAT-NEXT: pop {r4, r5, r7, pc} ; ; NOMVE-LABEL: unscaled_i32_i8_scatter: ; NOMVE: @ %bb.0: @ %entry Index: llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -4,62 +4,58 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C) { ; CHECK-LABEL: loads_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, 
r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vldrw.u32 q5, [r2] -; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s8, s20 +; CHECK-NEXT: vmov r4, r1, d4 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov r5, s12 ; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s10, s21 ; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov.f32 s20, s26 -; CHECK-NEXT: vmov.f32 s22, s27 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov.f32 s26, s25 -; CHECK-NEXT: vand q3, q6, q3 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: asrs r3, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: asrl r0, r1, r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r0, r12, d5 +; CHECK-NEXT: vmov.f32 s8, s20 +; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: adds r2, r5, r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: asr.w r6, r5, #31 +; CHECK-NEXT: adcs r1, r6 +; CHECK-NEXT: asrl r2, r1, r4 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r4, r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3 -; CHECK-NEXT: asrl r4, r1, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3 +; CHECK-NEXT: adds r6, r1, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc.w r1, r4, lr +; CHECK-NEXT: asrl r6, r1, r3 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: asr.w r3, r1, #31 +; CHECK-NEXT: adc.w r1, r3, r12 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: asrl r2, r1, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc.w r1, r2, r4 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: asrl r6, r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %b = load <4 x i32>, <4 x i32> *%B, align 4 @@ -142,63 +138,62 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) { ; CHECK-LABEL: load_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} 
+; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vldrw.u32 q5, [r2] -; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s4, s20 +; CHECK-NEXT: vmov r5, r1, d2 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov r6, s12 ; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s6, s21 ; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov.f32 s20, s26 -; CHECK-NEXT: vmov.f32 s22, s27 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vand q5, q5, q3 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov.f32 s26, s25 -; CHECK-NEXT: vand q3, q6, q3 +; CHECK-NEXT: vmov r4, lr, d4 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov lr, s13 -; CHECK-NEXT: asr.w r12, r0, #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: asrl r0, r1, r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov.f32 s4, s20 +; CHECK-NEXT: vmov.f32 s6, s21 +; CHECK-NEXT: adds r2, r6, r5 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: asr.w r7, r6, #31 +; CHECK-NEXT: adcs r1, r7 +; CHECK-NEXT: asrl r2, r1, r5 +; CHECK-NEXT: vmov r7, s4 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r4, r1, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, lr -; CHECK-NEXT: asrl r4, r1, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: vmov lr, s23 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, lr -; CHECK-NEXT: vmov r12, s18 -; CHECK-NEXT: asrl r2, r1, r12 -; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: adds r4, r4, r1 +; CHECK-NEXT: asr.w r5, r1, #31 +; CHECK-NEXT: adc.w r1, r5, lr +; CHECK-NEXT: asrl r4, r1, r7 +; CHECK-NEXT: vmov r6, r5, d5 ; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 ; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r12, r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: asrl r0, r1, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 +; CHECK-NEXT: asr.w r7, r1, #31 +; CHECK-NEXT: adc.w r1, r7, r12 +; CHECK-NEXT: vmov r7, s18 +; CHECK-NEXT: asrl r0, r1, r7 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: adds r6, r6, r1 +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc.w r1, r2, r5 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: asrl r6, r1, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r0 ; CHECK-NEXT: vstrw.32 q2, [r3] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %b = load <4 x i32>, <4 x i32> *%B, align 4 @@ -377,33 +372,31 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldr.w lr, [sp, #20] ; CHECK-NEXT: vmov.f32 s8, s0 ; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r5, s12 
+; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: smull r12, r3, r1, r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: vmov r4, s14 ; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s10, s3 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: asrl r12, r3, r2 ; CHECK-NEXT: vmov.f32 s2, s7 ; CHECK-NEXT: vmullb.s32 q1, q0, q2 -; CHECK-NEXT: vmov r7, s7 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: smull r0, r3, r1, r0 -; CHECK-NEXT: ldr r1, [sp, #20] -; CHECK-NEXT: asrl r0, r3, r2 -; CHECK-NEXT: smull r12, r5, r4, r5 -; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r6, r1, d2 +; CHECK-NEXT: vmov r4, r7, d3 +; CHECK-NEXT: asrl r6, r1, r2 ; CHECK-NEXT: asrl r4, r7, r2 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: asrl r6, r7, r2 -; CHECK-NEXT: asrl r12, r5, r2 +; CHECK-NEXT: smull r0, r5, r5, r0 +; CHECK-NEXT: asrl r0, r5, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r12, r4 -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [lr] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 Index: llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -180,52 +180,45 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vand q4, q4, q3 ; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q1, q3 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov lr, s19 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r12, lr, d7 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: asr.w r5, r4, #31 +; CHECK-NEXT: adcs r1, r5 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r3, s13 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, r3 -; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: lsrl r2, r3, #1 +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds r4, r1, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: adc.w r1, r12, lr -; CHECK-NEXT: lsrl r4, r1, #1 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adds.w r4, r3, r12 +; CHECK-NEXT: asr.w r6, r3, #31 +; CHECK-NEXT: adc.w r3, r6, lr +; CHECK-NEXT: asrs r2, r0, #31 +; 
CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r5 +; CHECK-NEXT: lsrl r4, r3, #1 ; CHECK-NEXT: lsrl r0, r1, #1 ; CHECK-NEXT: vmov q1[3], q1[1], r0, r4 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> @@ -382,114 +375,115 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff +; CHECK-NEXT: vmov.i64 q3, #0xffffffff ; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vand q2, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 ; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vand q2, q2, q4 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov r10, s12 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r9, s0 -; CHECK-NEXT: vmov r11, s4 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: adds.w r2, r10, r4 -; CHECK-NEXT: asr.w r0, r10, #31 -; CHECK-NEXT: adc.w r5, r0, r1 -; CHECK-NEXT: asrl r2, r5, r4 -; CHECK-NEXT: subs r6, r2, r4 -; CHECK-NEXT: sbc.w r12, r5, r1 -; CHECK-NEXT: adds.w r0, r9, r11 -; CHECK-NEXT: asr.w r2, r9, #31 -; CHECK-NEXT: adc.w r3, r2, r7 -; CHECK-NEXT: umull r8, r5, r6, r4 -; CHECK-NEXT: asrl r0, r3, r11 -; CHECK-NEXT: subs.w r0, r0, r11 -; CHECK-NEXT: mla r5, r12, r4, r5 -; CHECK-NEXT: sbc.w r12, r3, r7 -; CHECK-NEXT: umull r2, r1, r0, r11 -; CHECK-NEXT: muls r0, r7, r0 -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: orr.w lr, r1, r0 -; CHECK-NEXT: rsb.w r0, r10, #0 -; CHECK-NEXT: lsll r8, r5, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: lsll r8, r5, r4 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: eor.w r4, r4, r10 -; CHECK-NEXT: orr.w r4, r4, r10, asr #31 -; CHECK-NEXT: asrs r3, r7, #31 -; CHECK-NEXT: adds r6, r7, r0 -; CHECK-NEXT: adcs r3, r1 -; CHECK-NEXT: asrl r6, r3, r0 -; CHECK-NEXT: subs r6, r6, r0 -; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: umull r6, r3, r6, r0 -; CHECK-NEXT: mla r1, r1, r0, r3 -; CHECK-NEXT: rsbs r3, r7, #0 -; CHECK-NEXT: lsll r6, r1, r3 -; CHECK-NEXT: lsll r6, r1, r0 -; CHECK-NEXT: eors r0, r7 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: orr.w r0, r0, r7, asr #31 -; CHECK-NEXT: adds.w r8, r6, r5 -; CHECK-NEXT: eor.w r7, r6, r5 -; CHECK-NEXT: asr.w r3, r6, #31 -; CHECK-NEXT: orr.w r7, r7, r6, asr #31 -; CHECK-NEXT: adcs r3, r1 -; CHECK-NEXT: asrl r8, r3, r5 -; CHECK-NEXT: subs.w r8, r8, r5 -; CHECK-NEXT: sbcs r3, r1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: mul r1, r8, r1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov r12, r2, d5 +; CHECK-NEXT: vmov r8, r9, d3 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: adds.w r4, r1, r12 +; CHECK-NEXT: asr.w r0, r1, #31 +; CHECK-NEXT: adc.w r5, r0, r2 +; CHECK-NEXT: asrl r4, r5, r12 +; CHECK-NEXT: subs.w r0, r4, r12 +; CHECK-NEXT: sbc.w r2, r5, r2 +; CHECK-NEXT: asr.w r5, lr, #31 +; CHECK-NEXT: umull r0, r4, r0, r12 +; CHECK-NEXT: adds.w r6, lr, r8 +; CHECK-NEXT: adc.w r5, r5, r9 +; CHECK-NEXT: asrl r6, r5, r8 +; CHECK-NEXT: mla r3, r2, r12, r4 +; CHECK-NEXT: subs.w r7, r6, r8 +; CHECK-NEXT: sbc.w r10, r5, r9 +; CHECK-NEXT: rsbs r2, r1, #0 +; CHECK-NEXT: vmov r5, 
s12 +; CHECK-NEXT: lsll r0, r3, r2 +; CHECK-NEXT: vmov r6, r2, d4 +; CHECK-NEXT: lsll r0, r3, r12 +; CHECK-NEXT: asrs r3, r5, #31 +; CHECK-NEXT: adds r4, r5, r6 +; CHECK-NEXT: adcs r3, r2 +; CHECK-NEXT: asrl r4, r3, r6 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbc.w r2, r3, r2 +; CHECK-NEXT: umull r4, r3, r4, r6 +; CHECK-NEXT: mla r3, r2, r6, r3 +; CHECK-NEXT: rsbs r2, r5, #0 +; CHECK-NEXT: lsll r4, r3, r2 +; CHECK-NEXT: lsll r4, r3, r6 +; CHECK-NEXT: eors r6, r5 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 +; CHECK-NEXT: umull r2, r0, r7, r8 +; CHECK-NEXT: orr.w r6, r6, r5, asr #31 +; CHECK-NEXT: mul r3, r7, r9 +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: orrs r0, r3 +; CHECK-NEXT: vmov r3, r4, d2 +; CHECK-NEXT: mla r11, r10, r8, r0 +; CHECK-NEXT: asr.w r9, r7, #31 +; CHECK-NEXT: adds r0, r7, r3 +; CHECK-NEXT: adc.w r9, r9, r4 +; CHECK-NEXT: asrl r0, r9, r3 +; CHECK-NEXT: subs.w r10, r0, r3 +; CHECK-NEXT: sbc.w r9, r9, r4 +; CHECK-NEXT: umull r0, r1, r10, r3 +; CHECK-NEXT: mul r4, r10, r4 +; CHECK-NEXT: orr.w r10, r1, r4 +; CHECK-NEXT: eor.w r1, lr, r8 +; CHECK-NEXT: orr.w r1, r1, lr, asr #31 +; CHECK-NEXT: eor.w r4, r7, r3 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: orr.w r4, r4, r7, asr #31 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: rsbs r7, r7, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: cset r4, eq ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: vmov.32 q4[1], r4 -; CHECK-NEXT: vmov q4[2], q4[0], r4, r0 -; CHECK-NEXT: umull r4, r0, r8, r5 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: eor.w r1, r9, r11 -; CHECK-NEXT: orr.w r1, r1, r9, asr #31 +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r1 +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vbic q4, q1, q0 +; CHECK-NEXT: eor.w r1, r4, r12 +; CHECK-NEXT: orr.w r1, r1, r4, asr #31 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: cset r7, eq -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r7 -; CHECK-NEXT: mla r7, r3, r5, r0 -; CHECK-NEXT: rsbs r1, r6, #0 -; CHECK-NEXT: vbic q5, q1, q0 -; CHECK-NEXT: mla r3, r12, r11, lr -; CHECK-NEXT: rsb.w r0, r9, #0 -; CHECK-NEXT: lsll r4, r7, r1 -; CHECK-NEXT: vbic q1, q2, q4 -; CHECK-NEXT: lsll r2, r3, r0 -; CHECK-NEXT: vand q2, q3, q4 -; CHECK-NEXT: lsll r4, r7, r5 -; CHECK-NEXT: lsll r2, r3, r11 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cset r6, eq +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r6, ne +; CHECK-NEXT: vmov.32 q5[1], r6 +; CHECK-NEXT: vmov q5[2], q5[0], r6, r1 +; CHECK-NEXT: mla r1, r9, r3, r10 +; CHECK-NEXT: rsb.w r6, lr, #0 +; CHECK-NEXT: vbic q1, q2, q5 +; CHECK-NEXT: lsll r2, r11, r6 +; CHECK-NEXT: lsll r0, r1, r7 +; CHECK-NEXT: vand q2, q3, q5 +; CHECK-NEXT: lsll r2, r11, r8 +; CHECK-NEXT: lsll r0, r1, r3 ; CHECK-NEXT: vorr q1, q2, q1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q5 +; CHECK-NEXT: vorr q0, q0, q4 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} Index: llvm/test/CodeGen/Thumb2/mve-masked-load.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ 
llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -1750,28 +1750,28 @@ define arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(<2 x i64> *%dest, <2 x i64> %a) { ; CHECK-LE-LABEL: masked_v2i64_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r3, s0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vmov r1, s1 -; CHECK-LE-NEXT: vmov r12, s3 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: vmov r3, s2 -; CHECK-LE-NEXT: sbcs.w r1, r2, r1 +; CHECK-LE-NEXT: vmov r1, r2, d0 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 ; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r2, #1 -; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: movlt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r3, #3 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: beq .LBB49_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI49_0 @@ -1784,7 +1784,7 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r7, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: ; CHECK-LE-NEXT: .LCPI49_0: @@ -1793,29 +1793,29 @@ ; ; CHECK-BE-LABEL: masked_v2i64_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: vmov r3, s7 -; CHECK-BE-NEXT: vmov r1, s6 -; CHECK-BE-NEXT: vmov r12, s4 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: vmov r3, s5 -; CHECK-BE-NEXT: sbcs.w r1, r2, r1 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: vmov r1, r2, d3 +; CHECK-BE-NEXT: vmov r12, lr, d2 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r3, r2, r12 +; CHECK-BE-NEXT: rsbs.w r2, lr, #0 +; CHECK-BE-NEXT: sbcs.w r2, r3, r12 ; CHECK-BE-NEXT: it lt -; CHECK-BE-NEXT: movlt r2, #1 -; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 ; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r2, r3, #30 ; CHECK-BE-NEXT: bpl .LBB49_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: vldr d1, .LCPI49_0 @@ -1828,7 +1828,7 @@ ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne d1, [r0, #8] ; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: pop {r7, pc} ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: ; CHECK-BE-NEXT: .LCPI49_0: @@ -1843,28 +1843,28 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(<2 x double> *%dest, <2 x double> %a, <2 x i64> 
%b) { ; CHECK-LE-LABEL: masked_v2f64_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r3, s4 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vmov r1, s5 -; CHECK-LE-NEXT: vmov r12, s7 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: sbcs.w r1, r2, r1 +; CHECK-LE-NEXT: vmov r1, r2, d2 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 ; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r2, r12 +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 ; CHECK-LE-NEXT: it lt -; CHECK-LE-NEXT: movlt r2, #1 -; CHECK-LE-NEXT: cmp r2, #0 +; CHECK-LE-NEXT: movlt r3, #1 +; CHECK-LE-NEXT: cmp r3, #0 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: mvnne r2, #1 -; CHECK-LE-NEXT: bfi r2, r1, #0, #1 -; CHECK-LE-NEXT: and r1, r2, #3 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: mvnne r3, #1 +; CHECK-LE-NEXT: bfi r3, r1, #0, #1 +; CHECK-LE-NEXT: and r1, r3, #3 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: beq .LBB50_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI50_0 @@ -1877,7 +1877,7 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r7, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: ; CHECK-LE-NEXT: .LCPI50_0: @@ -1886,29 +1886,29 @@ ; ; CHECK-BE-LABEL: masked_v2f64_align4_zero: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q0, q1 -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: vmov r3, s3 -; CHECK-BE-NEXT: vmov r1, s2 -; CHECK-BE-NEXT: vmov r12, s0 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: vmov r3, s1 -; CHECK-BE-NEXT: sbcs.w r1, r2, r1 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: vmov r1, r2, d1 +; CHECK-BE-NEXT: vmov r12, lr, d0 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: sbcs.w r1, r3, r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 -; CHECK-BE-NEXT: rsbs r3, r3, #0 -; CHECK-BE-NEXT: sbcs.w r3, r2, r12 +; CHECK-BE-NEXT: rsbs.w r2, lr, #0 +; CHECK-BE-NEXT: sbcs.w r2, r3, r12 ; CHECK-BE-NEXT: it lt -; CHECK-BE-NEXT: movlt r2, #1 -; CHECK-BE-NEXT: cmp r2, #0 +; CHECK-BE-NEXT: movlt r3, #1 +; CHECK-BE-NEXT: cmp r3, #0 ; CHECK-BE-NEXT: it ne -; CHECK-BE-NEXT: mvnne r2, #1 -; CHECK-BE-NEXT: bfi r2, r1, #0, #1 -; CHECK-BE-NEXT: and r1, r2, #3 -; CHECK-BE-NEXT: lsls r2, r2, #30 +; CHECK-BE-NEXT: mvnne r3, #1 +; CHECK-BE-NEXT: bfi r3, r1, #0, #1 +; CHECK-BE-NEXT: and r1, r3, #3 +; CHECK-BE-NEXT: lsls r2, r3, #30 ; CHECK-BE-NEXT: bpl .LBB50_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: vldr d1, .LCPI50_0 @@ -1921,7 +1921,7 @@ ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne d1, [r0, #8] ; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: pop {r7, pc} ; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.4: ; CHECK-BE-NEXT: .LCPI50_0: Index: llvm/test/CodeGen/Thumb2/mve-masked-store.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -935,19 +935,19 @@ define arm_aapcs_vfpcc void @masked_v2i64(<2 x 
i64> *%dest, <2 x i64> %a) { ; CHECK-LE-LABEL: masked_v2i64: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, s0 +; CHECK-LE-NEXT: vmov r1, r2, d0 ; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: vmov r1, s1 -; CHECK-LE-NEXT: vmov r12, s3 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: vmov r2, s2 -; CHECK-LE-NEXT: sbcs.w r1, r3, r1 +; CHECK-LE-NEXT: vmov lr, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 ; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 ; CHECK-LE-NEXT: sbcs.w r2, r3, r12 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r3, #1 @@ -963,24 +963,24 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: masked_v2i64: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: movs r3, #0 -; CHECK-BE-NEXT: vmov r2, s7 -; CHECK-BE-NEXT: vmov r1, s6 -; CHECK-BE-NEXT: vmov r12, s4 +; CHECK-BE-NEXT: vmov r1, r2, d3 +; CHECK-BE-NEXT: vmov r12, lr, d2 ; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: vmov r2, s5 ; CHECK-BE-NEXT: sbcs.w r1, r3, r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: rsbs.w r2, lr, #0 ; CHECK-BE-NEXT: sbcs.w r2, r3, r12 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r3, #1 @@ -996,7 +996,7 @@ ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] ; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: pop {r7, pc} entry: %c = icmp sgt <2 x i64> %a, zeroinitializer call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %a, <2 x i64>* %dest, i32 8, <2 x i1> %c) @@ -1006,19 +1006,19 @@ define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) { ; CHECK-LE-LABEL: masked_v2f64: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, s4 +; CHECK-LE-NEXT: vmov r1, r2, d2 ; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: vmov r1, s5 -; CHECK-LE-NEXT: vmov r12, s7 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: vmov r2, s6 -; CHECK-LE-NEXT: sbcs.w r1, r3, r1 +; CHECK-LE-NEXT: vmov lr, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 ; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r1, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 ; CHECK-LE-NEXT: sbcs.w r2, r3, r12 ; CHECK-LE-NEXT: it lt ; CHECK-LE-NEXT: movlt r3, #1 @@ -1034,24 +1034,24 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: masked_v2f64: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: movs r3, #0 -; CHECK-BE-NEXT: vmov r2, s11 -; CHECK-BE-NEXT: vmov r1, s10 -; CHECK-BE-NEXT: vmov r12, s8 +; CHECK-BE-NEXT: vmov r1, r2, d5 +; CHECK-BE-NEXT: vmov r12, lr, d4 ; CHECK-BE-NEXT: rsbs r2, r2, #0 -; CHECK-BE-NEXT: 
vmov r2, s9 ; CHECK-BE-NEXT: sbcs.w r1, r3, r1 ; CHECK-BE-NEXT: mov.w r1, #0 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r1, #1 -; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: rsbs.w r2, lr, #0 ; CHECK-BE-NEXT: sbcs.w r2, r3, r12 ; CHECK-BE-NEXT: it lt ; CHECK-BE-NEXT: movlt r3, #1 @@ -1067,7 +1067,7 @@ ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne d1, [r0, #8] ; CHECK-BE-NEXT: add sp, #4 -; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: pop {r7, pc} entry: %c = icmp sgt <2 x i64> %b, zeroinitializer call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %a, <2 x double>* %dest, i32 8, <2 x i1> %c) Index: llvm/test/CodeGen/Thumb2/mve-minmax.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -38,36 +38,32 @@ define arm_aapcs_vfpcc <2 x i64> @smin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-LABEL: smin_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: sbcs.w r1, r1, r12 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs.w r2, lr, r12 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r12, lr, d2 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: sbcs.w r1, r5, lr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = icmp slt <2 x i64> %s1, %s2 %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2 @@ -110,36 +106,32 @@ define arm_aapcs_vfpcc <2 x i64> @umin_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-LABEL: umin_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: sbcs.w r1, r1, r12 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs.w r2, lr, r12 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r12, lr, d2 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w 
r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: sbcs.w r1, r5, lr +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = icmp ult <2 x i64> %s1, %s2 %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2 @@ -183,36 +175,32 @@ define arm_aapcs_vfpcc <2 x i64> @smax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-LABEL: smax_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov lr, s5 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sbcs.w r1, r1, r12 -; CHECK-NEXT: vmov r12, s1 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs.w r2, lr, r12 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: sbcs.w r1, r5, lr +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = icmp sgt <2 x i64> %s1, %s2 %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2 @@ -255,36 +243,32 @@ define arm_aapcs_vfpcc <2 x i64> @umax_v2i64(<2 x i64> %s1, <2 x i64> %s2) { ; CHECK-LABEL: umax_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov lr, s5 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sbcs.w r1, r1, r12 -; CHECK-NEXT: vmov r12, s1 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs.w r2, lr, r12 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; 
CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: sbcs.w r1, r5, lr +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = icmp ugt <2 x i64> %s1, %s2 %1 = select <2 x i1> %0, <2 x i64> %s1, <2 x i64> %s2 Index: llvm/test/CodeGen/Thumb2/mve-neg.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-neg.ll +++ llvm/test/CodeGen/Thumb2/mve-neg.ll @@ -34,17 +34,15 @@ define arm_aapcs_vfpcc <2 x i64> @neg_v2i64(<2 x i64> %s1) { ; CHECK-LABEL: neg_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: sbc.w r0, r12, r0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: sbc.w r3, r12, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbc.w r1, r12, r1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: sbc.w r2, r12, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <2 x i64> zeroinitializer, %s1 Index: llvm/test/CodeGen/Thumb2/mve-nofloat.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-nofloat.ll +++ llvm/test/CodeGen/Thumb2/mve-nofloat.ll @@ -102,31 +102,31 @@ define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-NOFP-LABEL: vector_add_f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: .save {r7, lr} -; CHECK-NOFP-NEXT: push {r7, lr} +; CHECK-NOFP-NEXT: .save {r4, r5, r7, lr} +; CHECK-NOFP-NEXT: push {r4, r5, r7, lr} ; CHECK-NOFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NOFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: vmov q4, q1 +; CHECK-NOFP-NEXT: vmov q5, q1 ; CHECK-NOFP-NEXT: vmov q6, q0 -; CHECK-NOFP-NEXT: vmov r0, s27 -; CHECK-NOFP-NEXT: vmov r1, s19 +; CHECK-NOFP-NEXT: vmov r4, r0, d13 +; CHECK-NOFP-NEXT: vmov r5, r1, d11 ; CHECK-NOFP-NEXT: bl __aeabi_fadd -; CHECK-NOFP-NEXT: vmov s23, r0 -; CHECK-NOFP-NEXT: vmov r0, s26 -; CHECK-NOFP-NEXT: vmov r1, s18 +; CHECK-NOFP-NEXT: vmov s19, r0 +; CHECK-NOFP-NEXT: mov r0, r4 +; CHECK-NOFP-NEXT: mov r1, r5 ; CHECK-NOFP-NEXT: bl __aeabi_fadd -; CHECK-NOFP-NEXT: vmov s22, r0 -; CHECK-NOFP-NEXT: vmov r0, s25 -; CHECK-NOFP-NEXT: vmov r1, s17 +; CHECK-NOFP-NEXT: vmov s18, r0 +; CHECK-NOFP-NEXT: vmov r4, r0, d12 +; CHECK-NOFP-NEXT: vmov r5, r1, d10 ; CHECK-NOFP-NEXT: bl __aeabi_fadd -; CHECK-NOFP-NEXT: vmov s21, r0 -; CHECK-NOFP-NEXT: vmov r0, s24 -; CHECK-NOFP-NEXT: vmov r1, s16 +; CHECK-NOFP-NEXT: vmov s17, r0 +; CHECK-NOFP-NEXT: mov r0, r4 +; CHECK-NOFP-NEXT: mov r1, r5 ; CHECK-NOFP-NEXT: bl __aeabi_fadd -; CHECK-NOFP-NEXT: vmov s20, r0 -; CHECK-NOFP-NEXT: vmov q0, q5 +; CHECK-NOFP-NEXT: vmov s16, r0 +; CHECK-NOFP-NEXT: vmov q0, q4 ; CHECK-NOFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: pop {r7, pc} +; CHECK-NOFP-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: ; CHECK-FP: @ %bb.0: @ %entry Index: 
llvm/test/CodeGen/Thumb2/mve-phireg.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -6,54 +6,52 @@ define arm_aapcs_vfpcc void @k() { ; CHECK-LABEL: k: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: adr r5, .LCPI0_0 -; CHECK-NEXT: adr r4, .LCPI0_1 -; CHECK-NEXT: vldrw.u32 q5, [r5] -; CHECK-NEXT: vldrw.u32 q6, [r4] +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: adr.w r8, .LCPI0_0 +; CHECK-NEXT: adr.w r9, .LCPI0_1 +; CHECK-NEXT: vldrw.u32 q6, [r8] +; CHECK-NEXT: vldrw.u32 q5, [r9] ; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: vmov.i8 q1, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.i16 q3, #0x6 ; CHECK-NEXT: vmov.i16 q4, #0x3 -; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vand q5, q5, q0 ; CHECK-NEXT: vand q6, q6, q0 +; CHECK-NEXT: vand q5, q5, q0 +; CHECK-NEXT: vcmp.i32 eq, q6, zr +; CHECK-NEXT: cmp.w r12, #0 +; CHECK-NEXT: vpsel q6, q2, q1 ; CHECK-NEXT: vcmp.i32 eq, q5, zr ; CHECK-NEXT: vpsel q5, q2, q1 -; CHECK-NEXT: vcmp.i32 eq, q6, zr -; CHECK-NEXT: vpsel q7, q2, q1 -; CHECK-NEXT: vmov r1, s28 -; CHECK-NEXT: vmov.16 q6[0], r1 -; CHECK-NEXT: vmov r1, s29 -; CHECK-NEXT: vmov.16 q6[1], r1 -; CHECK-NEXT: vmov r1, s30 -; CHECK-NEXT: vmov.16 q6[2], r1 -; CHECK-NEXT: vmov r1, s31 -; CHECK-NEXT: vmov.16 q6[3], r1 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: vmov.16 q6[4], r1 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov.16 q6[5], r1 -; CHECK-NEXT: vmov r1, s22 -; CHECK-NEXT: vmov.16 q6[6], r1 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov.16 q6[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vmov.i32 q6, #0x0 -; CHECK-NEXT: vpsel q5, q4, q3 -; CHECK-NEXT: vstrh.16 q5, [r0] -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: cbz r0, .LBB0_2 -; CHECK-NEXT: le .LBB0_1 -; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader +; CHECK-NEXT: vmov r4, r0, d12 +; CHECK-NEXT: vmov r3, r6, d10 +; CHECK-NEXT: vmov r1, r2, d11 +; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vmov.16 q5[1], r6 +; CHECK-NEXT: vmov r5, r7, d13 +; CHECK-NEXT: vmov.16 q5[2], r1 +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.16 q5[6], r5 +; CHECK-NEXT: vmov.16 q5[7], r7 +; CHECK-NEXT: vcmp.i16 ne, q5, zr +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: vstrh.16 q6, [r0] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond4.preheader ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: cbnz r6, .LBB0_5 ; CHECK-NEXT: .LBB0_3: @ %for.body10 @@ -63,8 +61,8 @@ ; CHECK-NEXT: .LBB0_4: @ %for.cond4.loopexit ; CHECK-NEXT: bl l ; CHECK-NEXT: .LBB0_5: @ %vector.body105.preheader -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r8] +; CHECK-NEXT: vldrw.u32 q1, [r9] ; CHECK-NEXT: vmov.i32 q2, #0x8 ; CHECK-NEXT: .LBB0_6: @ %vector.body105 
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -73,7 +71,7 @@ ; CHECK-NEXT: cbz r6, .LBB0_7 ; CHECK-NEXT: le .LBB0_6 ; CHECK-NEXT: .LBB0_7: @ %vector.body115.ph -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r9] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: @APP ; CHECK-NEXT: nop Index: llvm/test/CodeGen/Thumb2/mve-pred-and.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-and.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-and.ll @@ -575,11 +575,9 @@ ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q2, q0, q1 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -604,33 +602,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r12, r2, d4 +; CHECK-NEXT: vmov r3, r1, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, r3, r12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -656,29 +648,25 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqr_v2i1(<2 x i64> %a, <2 x i64> %b, i64 %c) { ; CHECK-LABEL: cmpeqr_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: eors r3, r1 +; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r12, r3, d2 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -389,11 +389,9 @@ ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 
; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r0, s1 -; CHECK-LE-NEXT: vmov r1, s0 -; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: orrs r0, r1 -; CHECK-LE-NEXT: vmov r1, s3 +; CHECK-LE-NEXT: vmov r1, r2, d1 ; CHECK-LE-NEXT: cset r0, eq ; CHECK-LE-NEXT: orrs r1, r2 ; CHECK-LE-NEXT: cset r1, eq @@ -410,11 +408,9 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov r0, s6 -; CHECK-BE-NEXT: vmov r1, s7 -; CHECK-BE-NEXT: vmov r2, s5 +; CHECK-BE-NEXT: vmov r0, r1, d3 ; CHECK-BE-NEXT: orrs r0, r1 -; CHECK-BE-NEXT: vmov r1, s4 +; CHECK-BE-NEXT: vmov r1, r2, d2 ; CHECK-BE-NEXT: cset r0, eq ; CHECK-BE-NEXT: orrs r1, r2 ; CHECK-BE-NEXT: cset r1, eq Index: llvm/test/CodeGen/Thumb2/mve-pred-ext.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -46,23 +46,21 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: sext_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov r2, r12, d0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r3, r1 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: sbcs.w r1, r2, r1 +; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: sbcs.w r1, r3, r12 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 @@ -119,29 +117,29 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2i64(<2 x i64> %src) { ; CHECK-LABEL: zext_v2i1_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adr r1, .LCPI7_0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs.w r1, r0, r1 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: sbcs.w r2, r0, r2 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: adr r2, .LCPI7_0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: sbcs.w r0, r3, r1 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: rsbs.w r1, lr, #0 +; CHECK-NEXT: sbcs.w r1, r3, r12 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI7_0: Index: llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll =================================================================== --- 
llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -318,11 +318,9 @@ define arm_aapcs_vfpcc void @store_v2i1(<2 x i1> *%dst, <2 x i64> %a) { ; CHECK-LE-LABEL: store_v2i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov r1, s1 -; CHECK-LE-NEXT: vmov r2, s0 -; CHECK-LE-NEXT: vmov r3, s2 +; CHECK-LE-NEXT: vmov r1, r2, d0 ; CHECK-LE-NEXT: orrs r1, r2 -; CHECK-LE-NEXT: vmov r2, s3 +; CHECK-LE-NEXT: vmov r2, r3, d1 ; CHECK-LE-NEXT: cset r1, eq ; CHECK-LE-NEXT: orrs r2, r3 ; CHECK-LE-NEXT: cset r2, eq @@ -337,11 +335,9 @@ ; CHECK-BE-LABEL: store_v2i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vmov r1, s6 -; CHECK-BE-NEXT: vmov r2, s7 -; CHECK-BE-NEXT: vmov r3, s5 +; CHECK-BE-NEXT: vmov r1, r2, d3 ; CHECK-BE-NEXT: orrs r1, r2 -; CHECK-BE-NEXT: vmov r2, s4 +; CHECK-BE-NEXT: vmov r2, r3, d2 ; CHECK-BE-NEXT: cset r1, eq ; CHECK-BE-NEXT: orrs r2, r3 ; CHECK-BE-NEXT: cset r2, eq Index: llvm/test/CodeGen/Thumb2/mve-pred-not.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -323,11 +323,9 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -351,11 +349,9 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-pred-or.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-or.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-or.ll @@ -377,25 +377,21 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -421,33 +417,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r0, r1, d5 +; 
CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r12, r2, d4 +; CHECK-NEXT: vmov r3, r1, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, r3, r12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -459,27 +459,23 @@ ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vpsel q3, q2, q1 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r0, r1, d7 ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.16 q0[3], r1 ; CHECK-NEXT: vcmp.i32 eq, q3, zr ; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 ; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add r0, sp, #16 Index: llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll @@ -72,40 +72,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vbic q3, q3, q2 
; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -202,40 +196,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, ne ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: vbic q3, q3, q2 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -441,25 +429,21 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: csetm r12, ne ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: csetm lr, ne ; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: cset r2, eq Index: llvm/test/CodeGen/Thumb2/mve-pred-xor.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-xor.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-xor.ll @@ -457,25 +457,21 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: cmpeqz_v2i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -501,33 +497,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeq_v2i1(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: cmpeq_v2i1: ; CHECK: @ %bb.0: @ 
%entry -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r12, r2, d4 +; CHECK-NEXT: vmov r3, r1, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, r3, r12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -68,20 +68,18 @@ ; CHECK-NEXT: vbic q3, q0, q2 ; CHECK-NEXT: vand q2, q4, q2 ; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: subs r3, r3, r6 +; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: vmov r4, r5, d5 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs r5, r5, r6 ; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbcs r4, r5, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 @@ -260,19 +258,17 @@ ; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov r7, s27 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov r4, r7, d13 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov r10, s24 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: sbcs.w r5, r2, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: vmov r10, r5, d12 ; CHECK-NEXT: csetm r8, ne ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 @@ -290,21 +286,19 @@ ; CHECK-NEXT: vbic q5, q0, q4 ; CHECK-NEXT: vand q4, q6, q4 ; CHECK-NEXT: vorr q4, q4, q5 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: vmov r4, r5, d9 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs.w r5, r5, r8 ; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r4, r5, #0 ; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: sbcs r4, r4, #0 
; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 @@ -313,10 +307,10 @@ ; CHECK-NEXT: vmov q5[2], q5[0], r3, r4 ; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: smull r6, r5, r6, r5 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 +; CHECK-NEXT: smull r6, r5, r6, r5 ; CHECK-NEXT: asrl r6, r5, #31 ; CHECK-NEXT: smull r4, r7, r4, r3 ; CHECK-NEXT: asrl r4, r7, #31 @@ -342,20 +336,18 @@ ; CHECK-NEXT: vbic q3, q0, q2 ; CHECK-NEXT: vand q2, q5, q2 ; CHECK-NEXT: vorr q2, q2, q3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: subs.w r3, r3, r8 -; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov r4, r3, d4 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov r3, r4, d5 ; CHECK-NEXT: csetm r5, ne ; CHECK-NEXT: vmov.32 q3[1], r5 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 @@ -538,18 +530,16 @@ ; CHECK-NEXT: vmov.f32 s30, s23 ; CHECK-NEXT: vmullb.s32 q0, q7, q6 ; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 ; CHECK-NEXT: vmov.f32 s22, s21 +; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 ; CHECK-NEXT: sbcs.w r7, r12, r5 ; CHECK-NEXT: mov.w r7, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r7, #1 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: vmov r4, r7, d0 ; CHECK-NEXT: csetm r10, ne ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 @@ -567,20 +557,18 @@ ; CHECK-NEXT: vbic q6, q2, q0 ; CHECK-NEXT: vand q0, q7, q0 ; CHECK-NEXT: vorr q6, q0, q6 -; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov r5, s26 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r3, r4, d12 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: vmov r4, r5, d13 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs.w r5, r5, r8 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r4, r5, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 @@ -618,20 +606,18 @@ ; CHECK-NEXT: vbic q4, q2, q0 ; CHECK-NEXT: vand q0, q5, q0 ; CHECK-NEXT: vorr q4, q0, q4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: subs.w r4, r4, r8 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: sbcs r3, r4, #0 +; CHECK-NEXT: vmov r4, r5, d9 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: subs.w r5, r5, r8 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r4, r5, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 @@ -901,14 +887,12 @@ ; CHECK-NEXT: vmov.f32 s14, s11 ; CHECK-NEXT: vmullb.u32 q4, q3, q1 ; CHECK-NEXT: 
vmov.f32 s2, s1 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r4, r5, d8 ; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: subs.w r6, r4, #-1 ; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: subs.w r6, r4, #-1 ; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov r6, r7, d9 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: lsrl r6, r7, #31 ; CHECK-NEXT: it lo @@ -928,13 +912,11 @@ ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vorn q1, q3, q1 ; CHECK-NEXT: vmullb.u32 q3, q2, q0 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: vmov r7, s15 ; CHECK-NEXT: subs.w r6, r4, #-1 ; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r6, r7, d7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: lsrl r6, r7, #31 ; CHECK-NEXT: it lo @@ -1566,23 +1548,19 @@ ; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vpsel q7, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov r4, s29 -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[1], r12 +; CHECK-NEXT: vmov r4, r12, d15 ; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s31 -; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vmov r4, r12, d12 ; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vmov r4, r12, d13 ; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vmov.16 q5[7], r12 ; CHECK-NEXT: vptt.i16 ne, q5, zr ; CHECK-NEXT: vldrht.u16 q5, [r0], #16 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16 @@ -1685,23 +1663,19 @@ ; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vpsel q7, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov r4, s29 -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[1], r12 +; CHECK-NEXT: vmov r4, r12, d15 ; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s31 -; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vmov r4, r12, d12 ; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vmov r4, r12, d13 ; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vmov.16 q5[7], r12 ; CHECK-NEXT: vptt.i16 ne, q5, zr ; CHECK-NEXT: vldrht.u16 q5, [r0], #16 ; CHECK-NEXT: vldrht.u16 q6, [r1], #16 @@ -2598,23 +2572,19 @@ ; CHECK-NEXT: vcmp.u32 cs, q1, q5 ; CHECK-NEXT: vpsel q7, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q6 -; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: vmov r4, r12, d14 ; CHECK-NEXT: vpsel q6, q3, q2 ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov r4, s29 -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s30 +; CHECK-NEXT: vmov.16 q5[1], r12 +; CHECK-NEXT: vmov r4, r12, d15 ; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s31 -; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q5[3], r12 +; CHECK-NEXT: vmov r4, r12, d12 ; CHECK-NEXT: vmov.16 q5[4], r4 -; 
CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s26 +; CHECK-NEXT: vmov.16 q5[5], r12 +; CHECK-NEXT: vmov r4, r12, d13 ; CHECK-NEXT: vmov.16 q5[6], r4 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: vmov.16 q5[7], r12 ; CHECK-NEXT: vptt.i16 ne, q5, zr ; CHECK-NEXT: vldrbt.s16 q5, [r0], #8 ; CHECK-NEXT: vldrbt.s16 q6, [r1], #8 @@ -2685,12 +2655,12 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: ssatmul_16t_q7: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #56 -; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB18_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -2701,14 +2671,14 @@ ; CHECK-NEXT: sub.w r12, r12, #16 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI18_1 -; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_2 -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_3 @@ -2718,90 +2688,82 @@ ; CHECK-NEXT: .LBB18_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: adds r3, #16 +; CHECK-NEXT: vdup.32 q0, r5 +; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r4, r3, d8 ; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov r3, r4, d9 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q7[2], r3 ; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: vmov.16 q7[4], r3 ; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov.16 q7[6], r3 ; CHECK-NEXT: vmov.16 q7[7], r4 ; CHECK-NEXT: vcmp.i16 ne, q7, zr ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q4[0] -; CHECK-NEXT: vmov.8 q7[0], r4 -; CHECK-NEXT: vmov.u16 r4, q4[1] -; CHECK-NEXT: vmov.8 q7[1], r4 -; CHECK-NEXT: vmov.u16 r4, q4[2] -; CHECK-NEXT: vmov.8 q7[2], r4 -; CHECK-NEXT: vmov.u16 r4, q4[3] -; CHECK-NEXT: vmov.8 q7[3], r4 -; CHECK-NEXT: vmov.u16 r4, q4[4] -; CHECK-NEXT: vmov.8 q7[4], r4 -; CHECK-NEXT: vmov.u16 r4, q4[5] -; CHECK-NEXT: vmov.8 q7[5], r4 -; CHECK-NEXT: vmov.u16 r4, q4[6] -; CHECK-NEXT: vmov.8 q7[6], r4 -; CHECK-NEXT: vmov.u16 r4, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[0] +; CHECK-NEXT: vmov.8 q7[0], r3 +; CHECK-NEXT: vmov.u16 
r3, q4[1] +; CHECK-NEXT: vmov.8 q7[1], r3 +; CHECK-NEXT: vmov.u16 r3, q4[2] +; CHECK-NEXT: vmov.8 q7[2], r3 +; CHECK-NEXT: vmov.u16 r3, q4[3] +; CHECK-NEXT: vmov.8 q7[3], r3 +; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov.8 q7[4], r3 +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov.8 q7[5], r3 +; CHECK-NEXT: vmov.u16 r3, q4[6] +; CHECK-NEXT: vmov.8 q7[6], r3 +; CHECK-NEXT: vmov.u16 r3, q4[7] ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vmov.8 q7[7], r3 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vorr q0, q0, q6 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov r3, r4, d10 ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.16 q4[0], r4 -; CHECK-NEXT: vmov r4, s21 +; CHECK-NEXT: vmov.16 q4[0], r3 ; CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov.16 q4[2], r4 -; CHECK-NEXT: vmov r4, s23 +; CHECK-NEXT: vmov r3, r4, d11 +; CHECK-NEXT: vmov.16 q4[2], r3 ; CHECK-NEXT: vmov.16 q4[3], r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q4[4], r4 -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov.16 q4[4], r3 ; CHECK-NEXT: vmov.16 q4[5], r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.16 q4[6], r4 -; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: vmov.16 q4[6], r3 ; CHECK-NEXT: vmov.16 q4[7], r4 ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q0[0] -; CHECK-NEXT: vmov.8 q7[8], r4 -; CHECK-NEXT: vmov.u16 r4, q0[1] -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov.u16 r4, q0[2] -; CHECK-NEXT: vmov.8 q7[10], r4 -; CHECK-NEXT: vmov.u16 r4, q0[3] -; CHECK-NEXT: vmov.8 q7[11], r4 -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.8 q7[12], r4 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov.8 q7[13], r4 -; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: vmov.8 q7[14], r4 -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: vmov.8 q7[15], r4 +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov.8 q7[8], r3 +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov.8 q7[9], r3 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.8 q7[10], r3 +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: vmov.8 q7[11], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.8 q7[12], r3 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov.8 q7[13], r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: vmov.8 q7[14], r3 +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: vmov.8 q7[15], r3 ; CHECK-NEXT: vptt.i8 ne, q7, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 ; CHECK-NEXT: vldrbt.u8 q4, [r1], #16 @@ -2813,9 +2775,9 @@ ; CHECK-NEXT: vstrbt.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_2 ; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI18_0: @@ -2885,12 +2847,12 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: ssatmul_16ti_q7: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 
-; CHECK-NEXT: .pad #56 -; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB19_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -2901,14 +2863,14 @@ ; CHECK-NEXT: sub.w r12, r12, #16 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI19_1 -; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_2 -; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_3 @@ -2918,90 +2880,82 @@ ; CHECK-NEXT: .LBB19_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: adds r3, #16 +; CHECK-NEXT: vdup.32 q0, r5 +; CHECK-NEXT: adds r5, #16 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r4, r3, d8 ; CHECK-NEXT: vmov.16 q7[0], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov.16 q7[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[2], r4 -; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov r3, r4, d9 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q7[2], r3 ; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov.16 q7[4], r4 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r3, r4, d8 +; CHECK-NEXT: vmov.16 q7[4], r3 ; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q7[6], r4 -; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov.16 q7[6], r3 ; CHECK-NEXT: vmov.16 q7[7], r4 ; CHECK-NEXT: vcmp.i16 ne, q7, zr ; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q4[0] -; CHECK-NEXT: vmov.8 q7[0], r4 -; CHECK-NEXT: vmov.u16 r4, q4[1] -; CHECK-NEXT: vmov.8 q7[1], r4 -; CHECK-NEXT: vmov.u16 r4, q4[2] -; CHECK-NEXT: vmov.8 q7[2], r4 -; CHECK-NEXT: vmov.u16 r4, q4[3] -; CHECK-NEXT: vmov.8 q7[3], r4 -; CHECK-NEXT: vmov.u16 r4, q4[4] -; CHECK-NEXT: vmov.8 q7[4], r4 -; CHECK-NEXT: vmov.u16 r4, q4[5] -; CHECK-NEXT: vmov.8 q7[5], r4 -; CHECK-NEXT: vmov.u16 r4, q4[6] -; CHECK-NEXT: vmov.8 q7[6], r4 -; CHECK-NEXT: vmov.u16 r4, q4[7] +; CHECK-NEXT: vmov.u16 r3, q4[0] +; CHECK-NEXT: vmov.8 q7[0], r3 +; CHECK-NEXT: vmov.u16 r3, q4[1] +; CHECK-NEXT: vmov.8 q7[1], r3 +; CHECK-NEXT: vmov.u16 r3, q4[2] +; CHECK-NEXT: vmov.8 q7[2], r3 +; CHECK-NEXT: vmov.u16 r3, q4[3] +; CHECK-NEXT: vmov.8 q7[3], r3 +; CHECK-NEXT: vmov.u16 r3, q4[4] +; CHECK-NEXT: vmov.8 q7[4], r3 +; CHECK-NEXT: vmov.u16 r3, q4[5] +; CHECK-NEXT: vmov.8 q7[5], r3 +; CHECK-NEXT: vmov.u16 r3, q4[6] +; CHECK-NEXT: vmov.8 q7[6], r3 +; CHECK-NEXT: vmov.u16 r3, q4[7] ; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vmov.8 q7[7], r3 ; CHECK-NEXT: vorr q4, q0, q4 ; CHECK-NEXT: vorr q0, q0, q6 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vpsel q5, q3, q2 ; CHECK-NEXT: vcmp.u32 cs, q1, q0 -; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov r3, r4, d10 ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.16 q4[0], r4 -; CHECK-NEXT: vmov r4, s21 +; CHECK-NEXT: vmov.16 q4[0], r3 ; 
CHECK-NEXT: vmov.16 q4[1], r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov.16 q4[2], r4 -; CHECK-NEXT: vmov r4, s23 +; CHECK-NEXT: vmov r3, r4, d11 +; CHECK-NEXT: vmov.16 q4[2], r3 ; CHECK-NEXT: vmov.16 q4[3], r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q4[4], r4 -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov.16 q4[4], r3 ; CHECK-NEXT: vmov.16 q4[5], r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.16 q4[6], r4 -; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: vmov.16 q4[6], r3 ; CHECK-NEXT: vmov.16 q4[7], r4 ; CHECK-NEXT: vcmp.i16 ne, q4, zr ; CHECK-NEXT: vpsel q0, q3, q2 -; CHECK-NEXT: vmov.u16 r4, q0[0] -; CHECK-NEXT: vmov.8 q7[8], r4 -; CHECK-NEXT: vmov.u16 r4, q0[1] -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov.u16 r4, q0[2] -; CHECK-NEXT: vmov.8 q7[10], r4 -; CHECK-NEXT: vmov.u16 r4, q0[3] -; CHECK-NEXT: vmov.8 q7[11], r4 -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.8 q7[12], r4 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov.8 q7[13], r4 -; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: vmov.8 q7[14], r4 -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: vmov.8 q7[15], r4 +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov.8 q7[8], r3 +; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov.8 q7[9], r3 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.8 q7[10], r3 +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: vmov.8 q7[11], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.8 q7[12], r3 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov.8 q7[13], r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: vmov.8 q7[14], r3 +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: vmov.8 q7[15], r3 ; CHECK-NEXT: vptt.i8 ne, q7, zr ; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 ; CHECK-NEXT: vldrbt.u8 q4, [r1], #16 @@ -3013,9 +2967,9 @@ ; CHECK-NEXT: vstrbt.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB19_2 ; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI19_0: Index: llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -36,48 +36,44 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: eor.w r12, r1, r0 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: eors r1, r0 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: eor.w r12, r3, r1 +; CHECK-NEXT: adc.w r0, r3, r1 +; CHECK-NEXT: eor.w r1, r3, r0 +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: bic.w r1, r1, r12 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov lr, r1, d2 ; CHECK-NEXT: cset r12, mi ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: asrne r2, r0, #31 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: mvn r5, #-2147483648 -; CHECK-NEXT: eor.w lr, r1, r3 -; CHECK-NEXT: adcs r3, r1 -; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: bic.w r1, r1, lr -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: cset r1, mi -; 
CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: eor.w r5, r4, r1 +; CHECK-NEXT: adcs r1, r4 +; CHECK-NEXT: eors r4, r1 +; CHECK-NEXT: bic.w r5, r4, r5 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: cset r5, mi +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r4, r3, #31 +; CHECK-NEXT: asrne r3, r1, #31 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: cset r2, mi +; CHECK-NEXT: mvn r3, #-2147483648 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r5, eq +; CHECK-NEXT: cinv r2, r3, eq ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r2, mi ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r5, eq -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r2, r3, ne +; CHECK-NEXT: cinv r2, r3, eq +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csel r1, r2, r1, ne ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -120,33 +116,29 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r2, d2 ; CHECK-NEXT: adcs lr, r12, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: adcs r3, r12, #0 +; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: adcs r5, r12, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: movne.w r3, #-1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r2, #-1 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) @@ -189,46 +181,42 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: subs r2, r2, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: eor.w r12, r1, r0 -; CHECK-NEXT: sbc.w r0, r1, r0 -; CHECK-NEXT: eors r1, r0 -; CHECK-NEXT: ands.w r1, r1, r12 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: eor.w r12, r3, r1 +; CHECK-NEXT: sbc.w r1, r3, r1 +; CHECK-NEXT: eor.w r2, r3, r1 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: ands.w r2, r2, r12 +; CHECK-NEXT: vmov lr, r2, d2 ; CHECK-NEXT: cset r12, mi ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r2, r0, #31 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: mvn r5, #-2147483648 -; CHECK-NEXT: eor.w lr, r1, r3 -; CHECK-NEXT: sbc.w r3, r1, r3 -; 
CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: ands.w r1, r1, lr -; CHECK-NEXT: cset r1, mi -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: asrne r0, r1, #31 +; CHECK-NEXT: subs.w r3, r3, lr +; CHECK-NEXT: eor.w r5, r4, r2 +; CHECK-NEXT: sbc.w r2, r4, r2 +; CHECK-NEXT: eors r4, r2 +; CHECK-NEXT: ands r5, r4 +; CHECK-NEXT: cset r5, mi +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: asrne r4, r3, #31 +; CHECK-NEXT: asrne r3, r2, #31 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: cset r0, mi +; CHECK-NEXT: mvn r3, #-2147483648 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: cset r2, mi -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r5, eq +; CHECK-NEXT: cinv r0, r3, eq ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: csel r0, r2, r0, ne -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: cset r2, mi +; CHECK-NEXT: csel r0, r0, r1, ne ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: cinv r2, r5, eq +; CHECK-NEXT: cset r1, mi ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: csel r1, r2, r3, ne +; CHECK-NEXT: cinv r1, r3, eq +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csel r1, r1, r2, ne ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -271,35 +259,31 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: sbcs.w r0, r1, r0 -; CHECK-NEXT: adc r1, r12, #0 -; CHECK-NEXT: rsbs.w lr, r1, #1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: adc r2, r12, #0 +; CHECK-NEXT: rsbs.w lr, r2, #1 +; CHECK-NEXT: vmov r3, r2, d2 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r2, #0 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r3 -; CHECK-NEXT: adc r3, r12, #0 -; CHECK-NEXT: rsbs.w r3, r3, #1 +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: subs r3, r4, r3 +; CHECK-NEXT: sbcs.w r2, r5, r2 +; CHECK-NEXT: adc r5, r12, #0 +; CHECK-NEXT: rsbs.w r5, r5, #1 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: movne r3, #0 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne r0, #0 -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r1, #0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r2, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %src1, <2 x i64> %src2) Index: llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -21,41 +21,36 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) { ; CHECK-LABEL: scatter_inc_mini_8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vmov.i32 q3, #0x10 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; 
CHECK-NEXT: vadd.i32 q4, q1, q3 +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vmov r3, r12, d3 ; CHECK-NEXT: vshl.i32 q1, q2, #1 -; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s17 ; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: strh r2, [r1] +; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r1] ; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: strh r1, [r2] ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: strh r1, [r3] ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh.w r1, [r12] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh.w r0, [lr] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} %1 = add <8 x i32> %offs, %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> ) @@ -65,69 +60,66 @@ define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) { ; CHECK-LABEL: scatter_inc_mini_16i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.i32 q5, #0x10 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x10 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q4, q1, q5 -; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: add r5, sp, #48 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q5 -; CHECK-NEXT: vadd.i32 q2, q2, q5 -; CHECK-NEXT: strb r2, [r1] -; CHECK-NEXT: add r1, sp, #32 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov r3, r12, d3 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q1, q4 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vmov lr, r7, d4 +; CHECK-NEXT: vmov.u8 r6, q0[0] ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vadd.i32 q1, q1, q5 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: 
strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r0, r8, d5 +; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: strb r6, [r1] +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r2] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r1, r9, d4 +; CHECK-NEXT: strb r6, [r3] +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: strb.w r4, [lr] +; CHECK-NEXT: vmov.u8 r4, q0[5] +; CHECK-NEXT: strb r4, [r7] +; CHECK-NEXT: vmov r7, r4, d2 +; CHECK-NEXT: strb r5, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r8] +; CHECK-NEXT: vmov r0, r5, d3 +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: strb.w r1, [r9] ; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strb r1, [r6] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r7] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r5] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} %1 = add <16 x i32> %offs, %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> ) Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -51,37 +51,35 @@ define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: scaled_v8i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vshl.i32 q2, q2, #1 ; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; 
CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strh.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strh r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.sext = sext <8 x i16> %offs to <8 x i32> @@ -99,27 +97,23 @@ ; CHECK-NEXT: vshl.i32 q2, q1, #1 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmov r1, r2, d5 +; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -180,40 +174,38 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: scaled_v8i16_i16_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vldrh.s32 q3, [r1] -; CHECK-NEXT: vmov.i32 q2, #0x28 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vadd.i32 q2, q3, q2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0x28 +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q1 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r12, lr, d5 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; 
CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strh.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strh r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -69,35 +69,33 @@ define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: unscaled_v8i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strh.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strh r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.sext = sext <8 x i16> %offs to <8 x i32> @@ -116,25 +114,21 @@ ; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmov r1, r2, d5 ; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; 
CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vstr.16 s1, [r1] +; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -149,35 +143,33 @@ define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: unscaled_v8i16_noext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strh.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strh r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i32>, <8 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs @@ -195,25 +187,21 @@ ; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmov r1, r2, d5 ; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vstr.16 s1, [r1] +; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 
-; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i32>, <8 x i32>* %offptr, align 4 @@ -259,38 +247,36 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.s32 q5, [r1] +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrb.s32 q4, [r1] +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: vmov r12, lr, d9 ; CHECK-NEXT: vldrb.s32 q4, [r1, #4] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q5, q5, r0 ; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: strh r4, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r4, r5, d9 +; CHECK-NEXT: strh r2, [r3] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: strh.w r2, [r12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: strh.w r2, [lr] +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: strh r2, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.sext = sext <8 x i8> %offs to <8 x i32> @@ -341,35 +327,29 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q3, [r1] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r12, lr, d5 ; CHECK-NEXT: vldrb.s32 q2, [r1, #4] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: strh r1, [r0] -; 
CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r6, d1 +; CHECK-NEXT: strh r4, [r2] +; CHECK-NEXT: vmov r2, r7, d4 +; CHECK-NEXT: strh r5, [r3] +; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: strh.w r0, [r12] +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: strh.w r6, [lr] +; CHECK-NEXT: vmov r6, r4, d3 +; CHECK-NEXT: strh r0, [r2] +; CHECK-NEXT: strh r1, [r7] +; CHECK-NEXT: strh r6, [r3] +; CHECK-NEXT: strh r4, [r5] +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.sext = sext <8 x i8> %offs to <8 x i32> @@ -408,35 +388,33 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrb.s32 q1, [r1, #4] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strb.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strb.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strb r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strb r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strb r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.sext = sext <8 x i8> %offs to <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -236,24 +236,22 @@ define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { ; CHECK-LABEL: ext_scaled_i16_i32_2gep: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vmov.i32 q1, #0xa -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r2, lr, d3 ; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; 
CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: strh.w r3, [r12] +; CHECK-NEXT: strh r4, [r2] +; CHECK-NEXT: strh.w r5, [lr] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll @@ -361,21 +361,19 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r2, lr, d3 ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: strb r4, [r2] +; CHECK-NEXT: strb.w r5, [lr] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> @@ -388,21 +386,19 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r2, lr, d3 ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: strb r4, [r2] +; CHECK-NEXT: strb.w r5, [lr] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -20,35 +20,33 @@ define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) { ; CHECK-LABEL: unscaled_v8i8_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q2, [r1] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 ; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 
q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: strb.w r2, [r12] +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: strb.w r2, [lr] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strb r0, [r1] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strb r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strb r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> @@ -85,66 +83,57 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.s32 q4, [r1] -; CHECK-NEXT: vldrb.s32 q1, [r1, #12] -; CHECK-NEXT: vldrb.s32 q2, [r1, #8] -; CHECK-NEXT: vldrb.s32 q3, [r1, #4] -; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrb.s32 q3, [r1, #8] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vldrb.s32 q1, [r1, #4] +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrb.s32 q1, [r1, #12] +; CHECK-NEXT: vmov r4, r8, 
d4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: strb.w r6, [r12] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov r6, r1, d7 +; CHECK-NEXT: strb r5, [r4] +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: strb.w r5, [r8] +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.sext = sext <16 x i8> %offs to <16 x i32> @@ -157,66 +146,57 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrh.s32 q4, [r1] -; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vldrh.s32 q2, [r1, #16] -; CHECK-NEXT: vldrh.s32 q3, [r1, #8] -; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrh.s32 q3, [r1, #16] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: 
vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrh.s32 q1, [r1, #24] +; CHECK-NEXT: vmov r4, r8, d4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: strb.w r6, [r12] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov r6, r1, d7 +; CHECK-NEXT: strb r5, [r4] +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: strb.w r5, [r8] +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %offs = load <16 x i16>, <16 x i16>* %offptr, align 2 %offs.sext = sext <16 x i16> %offs to <16 x i32> @@ -229,70 +209,61 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_scaled: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.u32 q4, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #12] -; CHECK-NEXT: vldrb.u32 q2, [r1, #8] -; CHECK-NEXT: vldrb.u32 q3, [r1, #4] -; CHECK-NEXT: vshl.i32 q4, q4, #2 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrb.u32 q3, [r1, #8] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: vmov.u8 r7, q0[4] ; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vshl.i32 q2, q2, #2 ; CHECK-NEXT: vshl.i32 q3, q3, #2 -; CHECK-NEXT: vadd.i32 q4, q4, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmov r4, r8, d4 ; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; 
CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: strb.w r6, [r12] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: strb r7, [r4] +; CHECK-NEXT: vmov.u8 r7, q0[5] +; CHECK-NEXT: strb.w r7, [r8] +; CHECK-NEXT: vmov r7, r4, d2 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r5] +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: strb r2, [r7] +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: strb r2, [r4] +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r1] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 4 %offs.zext = zext <16 x i8> %offs to <16 x i32> @@ -306,66 +277,57 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8_next: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vldrw.u32 q2, [r1, #32] -; CHECK-NEXT: vldrw.u32 q3, [r1, #16] -; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1, #32] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: vmov.u8 r5, q0[4] ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; 
CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov r12, lr, d3 +; CHECK-NEXT: vldrw.u32 q1, [r1, #16] +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [r1, #48] +; CHECK-NEXT: vmov r4, r8, d4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, r9, d5 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: strb.w r6, [r12] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov r6, r1, d7 +; CHECK-NEXT: strb r5, [r4] +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: strb.w r5, [r8] +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %offs = load <16 x i32>, <16 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs @@ -485,71 +447,65 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8_2gep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.s32 q1, [r1, #12] -; CHECK-NEXT: vldrb.s32 q2, [r1, #8] -; CHECK-NEXT: vldrb.s32 q3, [r1, #4] -; CHECK-NEXT: vldrb.s32 q5, [r1] -; CHECK-NEXT: vmov.i32 q4, #0x5 -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0x5 +; CHECK-NEXT: vldrb.s32 q4, [r1, #8] +; CHECK-NEXT: vmov.u8 r6, q0[0] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q4 -; CHECK-NEXT: vadd.i32 q2, q2, q4 -; CHECK-NEXT: vadd.i32 q3, q3, q4 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, 
s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: vadd.i32 q2, q2, q1 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov r12, lr, d5 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1, #12] +; CHECK-NEXT: vmov r4, r8, d6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, r9, d7 +; CHECK-NEXT: vadd.i32 q3, q4, q1 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: strb r6, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: strb r2, [r3] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r2, r10, d6 +; CHECK-NEXT: strb.w r6, [r12] +; CHECK-NEXT: vmov.u8 r6, q0[3] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: strb.w r6, [lr] +; CHECK-NEXT: vmov r6, r1, d7 +; CHECK-NEXT: strb r5, [r4] +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: strb.w r5, [r8] +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r9] +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r3, [r2] +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: strb.w r2, [r10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: strb r2, [r6] +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs Index: llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -36,33 +36,27 @@ define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, <8 x i32*>* %offptr) { ; CHECK-LABEL: ptr_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; 
CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r5, d1 +; CHECK-NEXT: str r3, [r1] +; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: str r4, [r2] +; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: str.w r0, [lr] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: str.w r5, [r12] +; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: str r3, [r7] +; CHECK-NEXT: str r5, [r2] +; CHECK-NEXT: str r6, [r4] +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %v, <8 x i32*> %offs, i32 4, <8 x i1> ) @@ -73,62 +67,51 @@ define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, <16 x i32*>* %offptr) { ; CHECK-LABEL: ptr_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r1, r2, d8 +; CHECK-NEXT: vmov lr, r12, d9 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov r0, r5, d1 +; CHECK-NEXT: str r3, [r1] +; CHECK-NEXT: vmov r1, r3, d12 +; CHECK-NEXT: str r4, [r2] +; CHECK-NEXT: vmov r2, r7, d13 +; CHECK-NEXT: str.w r0, [lr] +; CHECK-NEXT: vmov r0, r4, d2 +; 
CHECK-NEXT: str.w r5, [r12] +; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: str r4, [r3] +; CHECK-NEXT: vmov r3, r4, d11 +; CHECK-NEXT: str r5, [r2] +; CHECK-NEXT: vmov r2, r5, d4 +; CHECK-NEXT: str r6, [r7] +; CHECK-NEXT: vmov r7, r6, d5 +; CHECK-NEXT: str r2, [r0] +; CHECK-NEXT: vmov r0, r2, d8 +; CHECK-NEXT: str r5, [r1] +; CHECK-NEXT: vmov r1, r5, d9 +; CHECK-NEXT: str r7, [r3] +; CHECK-NEXT: vmov r3, r7, d6 +; CHECK-NEXT: str r6, [r4] +; CHECK-NEXT: vmov r6, r4, d7 +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: str r7, [r2] +; CHECK-NEXT: str r6, [r1] +; CHECK-NEXT: str r4, [r5] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %v, <16 x i32*> %offs, i32 4, <16 x i1> ) @@ -170,24 +153,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov lr, s10 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vstr s0, [r5] -; CHECK-NEXT: vstr s1, [r4] -; CHECK-NEXT: vstr s2, [r2] -; CHECK-NEXT: vstr s3, [r0] -; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: vstr s5, [r3] -; CHECK-NEXT: vstr s6, [lr] -; CHECK-NEXT: vstr s7, [r12] +; CHECK-NEXT: vmov r1, lr, d4 +; CHECK-NEXT: vmov r3, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov r0, r2, d4 +; CHECK-NEXT: vmov r4, r5, d5 +; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vstr s1, [lr] +; CHECK-NEXT: vstr s2, [r3] +; CHECK-NEXT: vstr s3, [r12] +; CHECK-NEXT: vstr s4, [r0] +; CHECK-NEXT: vstr s5, [r2] +; CHECK-NEXT: vstr s6, [r4] +; CHECK-NEXT: vstr s7, [r5] ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x float*>, <8 x float*>* %offptr, align 4 @@ -201,33 +180,31 @@ define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, <8 x i16*>* %offptr) { ; CHECK-LABEL: ptr_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vmov r3, r12, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strh r6, [r1] ; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: strh r1, [r2] ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: strh r1, [r3] ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh.w r1, [r12] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strh.w r0, [lr] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strh r0, [r4] +; CHECK-NEXT: 
vmov.u16 r0, q0[7] +; CHECK-NEXT: strh r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v, <8 x i16*> %offs, i32 2, <8 x i1> ) @@ -255,20 +232,18 @@ define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) { ; CHECK-LABEL: ptr_v4i16_trunc: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r2, lr, d3 ; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: strh.w r3, [r12] +; CHECK-NEXT: strh r4, [r2] +; CHECK-NEXT: strh.w r5, [lr] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4 %ext = trunc <4 x i32> %v to <4 x i16> @@ -280,33 +255,27 @@ define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) { ; CHECK-LABEL: ptr_v8i16_trunc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r5, d1 +; CHECK-NEXT: strh r3, [r1] +; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: strh r4, [r2] +; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: strh.w r0, [lr] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: strh.w r5, [r12] +; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: strh r0, [r1] +; CHECK-NEXT: strh r3, [r7] +; CHECK-NEXT: strh r5, [r2] +; CHECK-NEXT: strh r6, [r4] +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 %ext = trunc <8 x i32> %v to <8 x i16> @@ -323,25 +292,21 @@ ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vstr.16 s12, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vstr.16 s1, [r0] -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vstr.16 s1, [r0] +; CHECK-NEXT: vstr.16 s8, 
[r1] +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x half*>, <8 x half*>* %offptr, align 4 @@ -355,62 +320,53 @@ define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, <16 x i8*>* %offptr) { ; CHECK-LABEL: ptr_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: vmov r3, r12, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov lr, r4, d4 +; CHECK-NEXT: vmov.u8 r7, q0[6] +; CHECK-NEXT: vmov r0, r8, d5 +; CHECK-NEXT: strb r6, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: strb r1, [r2] +; CHECK-NEXT: vmov.u8 r6, q0[2] +; CHECK-NEXT: vmov r1, r9, d6 +; CHECK-NEXT: strb r6, [r3] +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: vmov r3, r6, d7 +; CHECK-NEXT: strb.w r5, [lr] +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: strb r5, [r4] +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: strb.w r0, [r8] +; CHECK-NEXT: vmov r0, r7, d3 +; CHECK-NEXT: strb r2, [r1] ; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: strb.w r1, [r9] ; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strb r1, [r6] ; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: strb r1, [r5] ; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: strb r1, [r4] ; CHECK-NEXT: vmov.u8 r1, q0[14] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: strb r0, [r7] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} 
entry: %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v, <16 x i8*> %offs, i32 2, <16 x i1> ) @@ -421,33 +377,31 @@ define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, <8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_trunc16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vmov r1, r2, d2 +; CHECK-NEXT: vmov r3, r12, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: strb r6, [r1] ; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: strb r1, [r2] ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: strb r1, [r3] ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strb.w r1, [r12] ; CHECK-NEXT: vmov.u16 r1, q0[4] ; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: strb.w r0, [lr] +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: strb r0, [r4] +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: strb r0, [r5] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %ext = trunc <8 x i16> %v to <8 x i8> @@ -459,20 +413,18 @@ define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v4i8_trunc32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov r0, r12, d2 +; CHECK-NEXT: vmov r2, lr, d3 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: strb.w r3, [r12] +; CHECK-NEXT: strb r4, [r2] +; CHECK-NEXT: strb.w r5, [lr] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4 %ext = trunc <4 x i32> %v to <4 x i8> @@ -484,33 +436,27 @@ define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, <8 x i8*>* %offptr) { ; CHECK-LABEL: ptr_v8i8_trunc32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; 
CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r0, r5, d1 +; CHECK-NEXT: strb r3, [r1] +; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: strb r4, [r2] +; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: strb.w r0, [lr] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: strb.w r5, [r12] +; CHECK-NEXT: vmov r5, r6, d3 +; CHECK-NEXT: strb r0, [r1] +; CHECK-NEXT: strb r3, [r7] +; CHECK-NEXT: strb r5, [r2] +; CHECK-NEXT: strb r6, [r4] +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 %ext = trunc <8 x i32> %v to <8 x i8> Index: llvm/test/CodeGen/Thumb2/mve-sext.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-sext.ll +++ llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -78,15 +78,12 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i64_v2i64_v2i35(<2 x i64> %m) { ; CHECK-LABEL: sext_v2i64_v2i64_v2i35: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbfx r0, r0, #0, #3 -; CHECK-NEXT: sbfx r1, r1, #0, #3 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: sbfx r0, r1, #0, #3 +; CHECK-NEXT: sbfx r1, r3, #0, #3 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: bx lr entry: %shl = shl <2 x i64> %m, Index: llvm/test/CodeGen/Thumb2/mve-shifts.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shifts.ll +++ llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -34,17 +34,17 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: shl_qq_int64_t: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, r1, d1 ; CHECK-NEXT: lsll r2, r1, r0 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: lsll r0, r3, r12 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r0, r5, d0 +; CHECK-NEXT: lsll r0, r5, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: pop {r5, pc} entry: %0 = shl <2 x i64> %src1, %src2 ret <2 x i64> %0 @@ -91,12 +91,10 @@ ; CHECK-NEXT: push {r5, lr} ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r5, d1 ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: lsll r0, r5, r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: lsll r2, r3, r1 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 @@ -144,17 +142,17 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qq_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: shrs_qq_int64_t: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} ; CHECK-NEXT: vmov r0, s6 -; 
CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, r1, d1 ; CHECK-NEXT: asrl r2, r1, r0 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: asrl r0, r3, r12 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r0, r5, d0 +; CHECK-NEXT: asrl r0, r5, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: pop {r5, pc} entry: %0 = ashr <2 x i64> %src1, %src2 ret <2 x i64> %0 @@ -194,11 +192,9 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qi_int64_t(<2 x i64> %src1) { ; CHECK-LABEL: shl_qi_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: lsll r0, r1, #4 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: lsll r2, r3, #4 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 @@ -242,11 +238,9 @@ define arm_aapcs_vfpcc <2 x i64> @shru_qi_int64_t(<2 x i64> %src1) { ; CHECK-LABEL: shru_qi_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: lsrl r0, r1, #4 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: lsrl r2, r3, #4 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 @@ -290,11 +284,9 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qi_int64_t(<2 x i64> %src1) { ; CHECK-LABEL: shrs_qi_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: asrl r0, r1, #4 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: asrl r2, r3, #4 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 @@ -344,11 +336,9 @@ define arm_aapcs_vfpcc <2 x i64> @shl_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shl_qr_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: lsll r12, r1, r0 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: lsll r2, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 @@ -403,16 +393,16 @@ define arm_aapcs_vfpcc <2 x i64> @shru_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shru_qr_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: lsll r2, r1, r12 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: lsll r0, r3, r12 +; CHECK-NEXT: .save {r5, lr} +; CHECK-NEXT: push {r5, lr} +; CHECK-NEXT: rsbs r3, r0, #0 +; CHECK-NEXT: vmov r2, r1, d1 +; CHECK-NEXT: vmov r0, r5, d0 +; CHECK-NEXT: lsll r2, r1, r3 +; CHECK-NEXT: lsll r0, r5, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov q0[3], q0[1], r5, r1 +; CHECK-NEXT: pop {r5, pc} entry: %i = insertelement <2 x i64> undef, i64 %src2, i32 0 %s = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer @@ -463,11 +453,9 @@ define arm_aapcs_vfpcc <2 x i64> @shrs_qr_int64_t(<2 x i64> %src1, i64 %src2) { ; CHECK-LABEL: shrs_qr_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r1, d1 +; 
CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: asrl r12, r1, r0 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: asrl r2, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 Index: llvm/test/CodeGen/Thumb2/mve-shuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -1678,8 +1678,7 @@ define arm_aapcs_vfpcc i64 @extract_i64_0(<2 x i64> %a) { ; CHECK-LABEL: extract_i64_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: bx lr entry: %res = extractelement <2 x i64> %a, i32 0 @@ -1689,8 +1688,7 @@ define arm_aapcs_vfpcc i64 @extract_i64_1(<2 x i64> %a) { ; CHECK-LABEL: extract_i64_1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: bx lr entry: %res = extractelement <2 x i64> %a, i32 1 Index: llvm/test/CodeGen/Thumb2/mve-simple-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -35,23 +35,19 @@ define arm_aapcs_vfpcc <2 x i64> @add_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: add_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r1, r0, d2 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r1, r1, r4 +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = add nsw <2 x i64> %src1, %src2 ret <2 x i64> %0 @@ -172,23 +168,19 @@ define arm_aapcs_vfpcc <2 x i64> @sub_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: sub_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: subs.w lr, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: sbc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbc.w r1, r3, r1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov r1, r0, d0 +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbc.w r3, r3, r12 +; CHECK-NEXT: subs r1, r4, r1 +; CHECK-NEXT: sbc.w r0, r5, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r3 +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %0 = sub nsw <2 x i64> %src2, %src1 ret <2 x i64> %0 @@ 
-309,25 +301,21 @@ define arm_aapcs_vfpcc <2 x i64> @mul_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: mul_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: umull r12, r3, r1, r0 -; CHECK-NEXT: mla lr, r1, r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: umull r4, r5, r1, r3 -; CHECK-NEXT: mla r1, r1, r2, r5 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: mla r0, r2, r0, lr -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 -; CHECK-NEXT: mla r1, r2, r3, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: umull r12, r3, r2, r0 +; CHECK-NEXT: mla r1, r2, r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: mla r0, lr, r0, r1 +; CHECK-NEXT: umull r6, r7, r2, r4 +; CHECK-NEXT: mla r2, r2, r5, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r6 +; CHECK-NEXT: mla r2, r3, r4, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %0 = mul nsw <2 x i64> %src1, %src2 ret <2 x i64> %0 Index: llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -91,21 +91,19 @@ } define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK-LE-LABEL: vector_add_i64: -; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} -; CHECK-LE-NEXT: add.w r12, sp, #8 -; CHECK-LE-NEXT: vldrw.u32 q0, [r12] -; CHECK-LE-NEXT: vmov lr, s0 -; CHECK-LE-NEXT: vmov r12, s1 -; CHECK-LE-NEXT: adds.w r0, r0, lr -; CHECK-LE-NEXT: vmov lr, s2 -; CHECK-LE-NEXT: adc.w r1, r1, r12 -; CHECK-LE-NEXT: vmov r12, s3 -; CHECK-LE-NEXT: adds.w r2, r2, lr -; CHECK-LE-NEXT: adc.w r3, r3, r12 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-MVE-LABEL: vector_add_i64: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .save {r7, lr} +; CHECK-MVE-NEXT: push {r7, lr} +; CHECK-MVE-NEXT: add.w r12, sp, #8 +; CHECK-MVE-NEXT: vldrw.u32 q0, [r12] +; CHECK-MVE-NEXT: vmov r12, lr, d0 +; CHECK-MVE-NEXT: adds.w r0, r0, r12 +; CHECK-MVE-NEXT: adc.w r1, r1, lr +; CHECK-MVE-NEXT: vmov r12, lr, d1 +; CHECK-MVE-NEXT: adds.w r2, r2, r12 +; CHECK-MVE-NEXT: adc.w r3, r3, lr +; CHECK-MVE-NEXT: pop {r7, pc} ; ; CHECK-BE-LABEL: vector_add_i64: ; CHECK-BE: @ %bb.0: @ %entry @@ -113,15 +111,27 @@ ; CHECK-BE-NEXT: push {r7, lr} ; CHECK-BE-NEXT: add.w r12, sp, #8 ; CHECK-BE-NEXT: vldrw.u32 q0, [r12] -; CHECK-BE-NEXT: vmov lr, s1 -; CHECK-BE-NEXT: vmov r12, s0 +; CHECK-BE-NEXT: vmov r12, lr, d0 ; CHECK-BE-NEXT: adds.w r1, r1, lr -; CHECK-BE-NEXT: vmov lr, s3 ; CHECK-BE-NEXT: adc.w r0, r0, r12 -; CHECK-BE-NEXT: vmov r12, s2 +; CHECK-BE-NEXT: vmov r12, lr, d1 ; CHECK-BE-NEXT: adds.w r3, r3, lr ; CHECK-BE-NEXT: adc.w r2, r2, r12 ; CHECK-BE-NEXT: pop {r7, pc} +; +; CHECK-FP-LABEL: vector_add_i64: +; CHECK-FP: @ %bb.0: @ %entry +; CHECK-FP-NEXT: .save {r4, r5, r7, lr} +; CHECK-FP-NEXT: push {r4, r5, r7, lr} +; CHECK-FP-NEXT: add.w r12, sp, #16 +; CHECK-FP-NEXT: vldrw.u32 q0, [r12] +; CHECK-FP-NEXT: vmov r12, lr, d0 +; CHECK-FP-NEXT: vmov r4, r5, d1 +; CHECK-FP-NEXT: adds.w r0, r0, r12 +; CHECK-FP-NEXT: 
adc.w r1, r1, lr +; CHECK-FP-NEXT: adds r2, r2, r4 +; CHECK-FP-NEXT: adcs r3, r5 +; CHECK-FP-NEXT: pop {r4, r5, r7, pc} entry: %sum = add <2 x i64> %lhs, %rhs ret <2 x i64> %sum @@ -338,67 +348,67 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-MVE-LABEL: vector_add_f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .save {r7, lr} -; CHECK-MVE-NEXT: push {r7, lr} +; CHECK-MVE-NEXT: .save {r4, r5, r7, lr} +; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-MVE-NEXT: vmov d11, r2, r3 -; CHECK-MVE-NEXT: vmov d10, r0, r1 -; CHECK-MVE-NEXT: add r1, sp, #56 -; CHECK-MVE-NEXT: vldrw.u32 q6, [r1] -; CHECK-MVE-NEXT: vmov r0, s23 -; CHECK-MVE-NEXT: vmov r1, s27 +; CHECK-MVE-NEXT: vmov d13, r2, r3 +; CHECK-MVE-NEXT: vmov d12, r0, r1 +; CHECK-MVE-NEXT: add r1, sp, #64 +; CHECK-MVE-NEXT: vldrw.u32 q5, [r1] +; CHECK-MVE-NEXT: vmov r4, r0, d13 +; CHECK-MVE-NEXT: vmov r5, r1, d11 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s19, r0 -; CHECK-MVE-NEXT: vmov r0, s22 -; CHECK-MVE-NEXT: vmov r1, s26 +; CHECK-MVE-NEXT: mov r0, r4 +; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s18, r0 -; CHECK-MVE-NEXT: vmov r0, s21 -; CHECK-MVE-NEXT: vmov r1, s25 +; CHECK-MVE-NEXT: vmov r4, r0, d12 +; CHECK-MVE-NEXT: vmov r5, r1, d10 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s17, r0 -; CHECK-MVE-NEXT: vmov r0, s20 -; CHECK-MVE-NEXT: vmov r1, s24 +; CHECK-MVE-NEXT: mov r0, r4 +; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s16, r0 ; CHECK-MVE-NEXT: vmov r2, r3, d9 ; CHECK-MVE-NEXT: vmov r0, r1, d8 ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-MVE-NEXT: pop {r7, pc} +; CHECK-MVE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-BE-LABEL: vector_add_f32: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r7, lr} -; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .save {r4, r5, r7, lr} +; CHECK-BE-NEXT: push {r4, r5, r7, lr} ; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 -; CHECK-BE-NEXT: add r1, sp, #56 +; CHECK-BE-NEXT: add r1, sp, #64 ; CHECK-BE-NEXT: vldrw.u32 q6, [r1] ; CHECK-BE-NEXT: vrev64.32 q5, q0 -; CHECK-BE-NEXT: vmov r0, s23 -; CHECK-BE-NEXT: vmov r1, s27 +; CHECK-BE-NEXT: vmov r4, r0, d11 +; CHECK-BE-NEXT: vmov r5, r1, d13 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s19, r0 -; CHECK-BE-NEXT: vmov r0, s22 -; CHECK-BE-NEXT: vmov r1, s26 +; CHECK-BE-NEXT: mov r0, r4 +; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s18, r0 -; CHECK-BE-NEXT: vmov r0, s21 -; CHECK-BE-NEXT: vmov r1, s25 +; CHECK-BE-NEXT: vmov r4, r0, d10 +; CHECK-BE-NEXT: vmov r5, r1, d12 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s17, r0 -; CHECK-BE-NEXT: vmov r0, s20 -; CHECK-BE-NEXT: vmov r1, s24 +; CHECK-BE-NEXT: mov r0, r4 +; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s16, r0 ; CHECK-BE-NEXT: vrev64.32 q0, q4 ; CHECK-BE-NEXT: vmov r1, r0, d0 ; CHECK-BE-NEXT: vmov r3, r2, d1 ; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: pop {r7, pc} +; CHECK-BE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: ; CHECK-FP: @ %bb.0: @ %entry Index: llvm/test/CodeGen/Thumb2/mve-vabd.ll =================================================================== --- 
llvm/test/CodeGen/Thumb2/mve-vabd.ll +++ llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -5,42 +5,42 @@ define arm_aapcs_vfpcc void @vabd_v4f32(<4 x float> %x, <4 x float> %y, <4 x float>* %z) { ; CHECK-MVE-LABEL: vabd_v4f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-MVE-NEXT: .pad #4 ; CHECK-MVE-NEXT: sub sp, #4 ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmov q4, q1 ; CHECK-MVE-NEXT: vmov q5, q0 -; CHECK-MVE-NEXT: mov r4, r0 -; CHECK-MVE-NEXT: vmov r0, s20 -; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: mov r8, r0 +; CHECK-MVE-NEXT: vmov r0, r6, d10 +; CHECK-MVE-NEXT: vmov r1, r7, d8 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov r0, s21 -; CHECK-MVE-NEXT: vmov r1, s17 +; CHECK-MVE-NEXT: mov r9, r0 +; CHECK-MVE-NEXT: mov r0, r6 +; CHECK-MVE-NEXT: mov r1, r7 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov r0, s22 -; CHECK-MVE-NEXT: vmov r1, s18 +; CHECK-MVE-NEXT: vmov r0, r7, d11 +; CHECK-MVE-NEXT: vmov r1, r4, d9 ; CHECK-MVE-NEXT: bl __aeabi_fsub -; CHECK-MVE-NEXT: mov r7, r0 -; CHECK-MVE-NEXT: vmov r0, s23 -; CHECK-MVE-NEXT: vmov r1, s19 +; CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: mov r0, r7 +; CHECK-MVE-NEXT: mov r1, r4 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 ; CHECK-MVE-NEXT: vmov s3, r0 -; CHECK-MVE-NEXT: bic r0, r7, #-2147483648 +; CHECK-MVE-NEXT: bic r0, r5, #-2147483648 ; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: bic r0, r6, #-2147483648 ; CHECK-MVE-NEXT: vmov s1, r0 -; CHECK-MVE-NEXT: bic r0, r5, #-2147483648 +; CHECK-MVE-NEXT: bic r0, r9, #-2147483648 ; CHECK-MVE-NEXT: vmov s0, r0 -; CHECK-MVE-NEXT: vstrw.32 q0, [r4] +; CHECK-MVE-NEXT: vstrw.32 q0, [r8] ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} ; CHECK-MVE-NEXT: add sp, #4 -; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; ; CHECK-MVEFP-LABEL: vabd_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry Index: llvm/test/CodeGen/Thumb2/mve-vabdus.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -386,57 +386,49 @@ ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vand q4, q3, q0 ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r3, r4, d8 ; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: vand q5, q5, q0 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: vmov r7, s23 -; CHECK-NEXT: subs.w r8, r6, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: sbc.w r4, r5, r4 -; CHECK-NEXT: vmov r6, s19 +; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: subs.w r8, r5, r3 +; CHECK-NEXT: vmov r7, r3, d11 +; CHECK-NEXT: sbc.w r4, r6, r4 ; CHECK-NEXT: asrs r5, r4, #31 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r5, #0, #4 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, r6, d9 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s18, s11 ; CHECK-NEXT: vand q2, q4, q0 ; CHECK-NEXT: vmov.f32 s16, s14 ; CHECK-NEXT: vmov.f32 s18, s15 ; CHECK-NEXT: vand q3, q4, q0 -; CHECK-NEXT: vmov r12, s12 -; CHECK-NEXT: subs.w r9, r3, r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: sbc.w r3, r7, r6 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: vmov r6, 
s15 +; CHECK-NEXT: subs.w r9, r7, r5 +; CHECK-NEXT: mov.w r7, #1 +; CHECK-NEXT: sbcs r3, r6 ; CHECK-NEXT: and.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s10 +; CHECK-NEXT: vmov r7, r5, d7 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: bfi r4, r3, #4, #4 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: subs.w r10, r5, r7 -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: sbc.w r3, r6, r3 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: asr.w r11, r3, #31 -; CHECK-NEXT: subs.w r6, r12, r6 -; CHECK-NEXT: sbc.w r7, r5, r7 -; CHECK-NEXT: asrs r7, r7, #31 -; CHECK-NEXT: vmov q2[2], q2[0], r7, r11 -; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: subs.w r10, r7, r3 +; CHECK-NEXT: vmov r7, r3, d4 +; CHECK-NEXT: sbcs r5, r6 +; CHECK-NEXT: vmov r6, r12, d6 +; CHECK-NEXT: asr.w r11, r5, #31 +; CHECK-NEXT: subs r6, r6, r7 +; CHECK-NEXT: sbc.w r3, r12, r3 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r11 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov q2[2], q2[0], r8, r6 ; CHECK-NEXT: vmov q2[3], q2[1], r9, r10 -; CHECK-NEXT: and r7, r7, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r4, r7, #8, #4 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: and.w r3, r7, r3, asr #31 +; CHECK-NEXT: and r3, r3, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r4, r3, #8, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: and.w r3, r3, r5, asr #31 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: bfi r4, r3, #12, #4 ; CHECK-NEXT: vmsr p0, r4 Index: llvm/test/CodeGen/Thumb2/mve-vaddv.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vaddv.ll +++ llvm/test/CodeGen/Thumb2/mve-vaddv.ll @@ -12,12 +12,10 @@ define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) { ; CHECK-LABEL: vaddv_v2i64_i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %r = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) @@ -92,14 +90,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %t = call i64 @llvm.vector.reduce.add.i64.v2i64(<2 x i64> %s1) Index: llvm/test/CodeGen/Thumb2/mve-vcmp.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcmp.ll +++ llvm/test/CodeGen/Thumb2/mve-vcmp.ll @@ -367,22 +367,18 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, <2 x i64> %srcb, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, 
r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r12, r2, d2 +; CHECK-NEXT: vmov r3, r1, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, r3, r12 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 @@ -402,22 +398,18 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, <2 x i64> %srcb, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r12, r2, d2 +; CHECK-NEXT: vmov r3, r1, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, r3, r12 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 @@ -441,12 +433,10 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-vcmpr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcmpr.ll +++ llvm/test/CodeGen/Thumb2/mve-vcmpr.ll @@ -433,18 +433,16 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r3, r1 +; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 @@ -466,18 +464,16 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r3, r1 +; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 @@ -503,12 +499,10 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vmov r1, s2 
-; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -1014,18 +1008,16 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_r_eq_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r3, r1 +; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 @@ -1047,18 +1039,16 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_r_eq_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: eors r3, r1 +; CHECK-NEXT: eors r2, r0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r12, r3, d0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: eors r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: eors r0, r3 +; CHECK-NEXT: eor.w r0, r0, r12 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 @@ -1084,12 +1074,10 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-vcmpz.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcmpz.ll +++ llvm/test/CodeGen/Thumb2/mve-vcmpz.ll @@ -361,11 +361,9 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_eqz_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -388,11 +386,9 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_eqz_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -775,11 +771,9 @@ define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: vcmp_r_eqz_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: 
cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -802,11 +796,9 @@ define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: vcmp_r_eqz_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne Index: llvm/test/CodeGen/Thumb2/mve-vcvt.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -266,11 +266,9 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 @@ -292,11 +290,9 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -38,12 +38,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -54,14 +52,16 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: adc.w r1, r3, r1, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> @@ -134,40 +134,36 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, 
s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i64> @@ -181,46 +177,49 @@ ; CHECK-NEXT: vmov.s16 r0, q0[1] ; CHECK-NEXT: vmov.s16 r1, q0[0] ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s16 r2, q0[3] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov.s16 r1, q0[3] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s16 r2, q0[5] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s16 r3, q0[4] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s16 r1, q0[5] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.s16 r3, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r0, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 
-; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.s16 r3, q0[7] -; CHECK-NEXT: adc.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov.s16 r1, q0[6] -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: adc.w r1, r0, r1, asr #31 -; CHECK-NEXT: adds r0, r2, r3 -; CHECK-NEXT: adc.w r1, r1, r3, asr #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov r0, r3, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -257,10 +256,9 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -271,13 +269,18 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxth r1, r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: adc.w r1, r3, r1, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -442,88 +445,76 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, 
d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[8] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[10] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[12] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> @@ -537,98 +528,97 @@ ; CHECK-NEXT: vmov.s8 r0, q0[1] ; CHECK-NEXT: vmov.s8 r1, q0[0] ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[3] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[2] -; CHECK-NEXT: adc.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[3] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; 
CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[5] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[5] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[6] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[7] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[9] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[9] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[11] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[10] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[11] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[13] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[13] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; 
CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.s8 r3, q0[15] -; CHECK-NEXT: adc.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov.s8 r1, q0[14] -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: adc.w r1, r0, r1, asr #31 -; CHECK-NEXT: adds r0, r2, r3 -; CHECK-NEXT: adc.w r1, r1, r3, asr #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.s8 r2, q0[15] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r0, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov r0, r3, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> @@ -644,41 +634,37 @@ ; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.u16 r1, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.u16 r3, q0[2] ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i8> %x to <8 x i64> @@ -694,52 +680,55 @@ ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r2, r0, #31 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov 
r1, s5 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: adc.w r1, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: sxtb r3, r0 -; CHECK-NEXT: adds r0, r2, r3 -; CHECK-NEXT: adc.w r1, r1, r3, asr #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r0, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 +; CHECK-NEXT: vmov r0, r3, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i64> @@ -778,10 +767,9 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -792,13 +780,18 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxtb r1, r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 
+; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: adc.w r1, r3, r1, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -809,12 +802,10 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) @@ -863,14 +854,12 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q1, #0xffffffff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -882,17 +871,21 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) { ; CHECK-LABEL: add_v2i32_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov.32 q1[1], r2 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: adds.w r12, r12, r3 -; CHECK-NEXT: adc.w r2, r2, r3, asr #31 -; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r2, lr, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i32> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -965,50 +958,46 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, r12, d5 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w r12, r3, r2 +; CHECK-NEXT: add.w lr, r3, r2 ; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: add.w lr, r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], 
r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add r12, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r4, r12, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r3, r2, d5 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r2, lr +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1024,48 +1013,51 @@ ; CHECK-NEXT: vmov.s16 r2, q0[1] ; CHECK-NEXT: vmov.s16 r3, q0[0] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 -; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s5 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r3, r2, d2 ; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.s16 r2, q0[3] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov.s16 r4, q0[5] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.s16 r2, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s16 r2, q0[5] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s16 r3, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s16 r3, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adc.w r4, r4, r12 ; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, 
s6 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4, asr #31 -; CHECK-NEXT: vmov.s16 r4, q0[6] -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: vmov.s16 r4, q0[7] -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r2, r4, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -1105,12 +1097,11 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r1, r1, r12 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -1122,16 +1113,23 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: asr.w r12, r2, #31 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r3, asr #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: bx lr +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r2, lr, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i16> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -1302,98 +1300,86 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, r12, d5 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w r12, r3, r2 +; CHECK-NEXT: add.w lr, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: add.w lr, r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add r12, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r4, r12, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r3, r2, d5 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.u8 r3, q0[6] -; 
CHECK-NEXT: adc.w r12, r2, lr +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: adc.w r3, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) @@ -1409,100 +1395,99 @@ ; 
CHECK-NEXT: vmov.s8 r2, q0[1] ; CHECK-NEXT: vmov.s8 r3, q0[0] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 -; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s5 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r3, r2, d2 ; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.s8 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.s8 r2, q0[3] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[5] ; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov.s8 r4, q0[5] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.s8 r2, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: vmov.s8 r3, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q0[6] -; CHECK-NEXT: adc.w r12, r12, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[7] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[7] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s8 r3, q0[6] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q0[8] -; CHECK-NEXT: adc.w r12, r12, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[9] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[9] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s8 r3, q0[8] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q0[10] -; CHECK-NEXT: adc.w r12, r12, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[11] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: 
vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[11] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s8 r3, q0[10] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q0[12] -; CHECK-NEXT: adc.w r12, r12, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[13] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[13] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s8 r3, q0[12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.s8 r2, q0[15] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adc.w r4, r4, r12 ; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[14] -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: vmov.s8 r4, q0[15] -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r2, r4, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> @@ -1514,51 +1499,47 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v8i8_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, r12, d5 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w r12, r3, r2 +; CHECK-NEXT: add.w lr, r3, r2 ; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: add.w r12, r2, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: add r2, lr +; CHECK-NEXT: add.w lr, 
r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add r12, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r4, r12, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r3, r2, d5 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r2, lr +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <8 x i8> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1576,54 +1557,57 @@ ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 -; CHECK-NEXT: vmov lr, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r12, s5 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r3, r2, d2 ; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 +; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: adc.w r12, r12, r2, asr #31 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 -; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r4, r2, #31 +; 
CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adc.w r4, r4, r12 ; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4, asr #31 -; CHECK-NEXT: vmov.u16 r4, q0[6] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adc.w r3, r3, r4, asr #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r2, r4, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r4, pc} entry: %xx = sext <8 x i8> %x to <8 x i64> @@ -1665,12 +1649,11 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r2, r12, d1 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r1, r1, r12 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -1682,16 +1665,23 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: asr.w r12, r2, #31 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r3, asr #31 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: bx lr +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r2, lr, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i8> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -1704,14 +1694,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -60,12 +60,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: 
adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -97,12 +95,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -215,12 +211,10 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r12, r2, d7 +; CHECK-NEXT: vmov r3, r1, d6 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: add.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 @@ -232,24 +226,22 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u16 r1, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r0, r1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r0, r2, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 @@ -259,33 +251,29 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r12, r3 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d5 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 +; CHECK-NEXT: ubfx r2, r2, #8, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 ; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: adds r0, r0, r3 -; 
CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -324,10 +312,8 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: vmov r3, r2, d4 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 @@ -343,15 +329,13 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q1[4] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] @@ -372,35 +356,31 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d3 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov.s16 r2, q0[7] -; CHECK-NEXT: vmov.s16 r3, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.s16 r1, q0[7] +; CHECK-NEXT: vmov.s16 r2, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -461,12 +441,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; 
CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -502,12 +480,10 @@ ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -766,6 +742,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr @@ -810,12 +788,10 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov r12, r2, d13 +; CHECK-NEXT: vmov r3, r1, d12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: add.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 @@ -827,24 +803,22 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r3, r0 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r0, r3, d12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u16 r1, q5[6] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q5[7] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: vmov q6[3], q6[1], r3, r1 +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r0, r1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r0, r2, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov q5[2], q5[0], r0, r3 @@ -854,33 +828,29 @@ ; CHECK-NEXT: vmov q6[2], q6[0], r3, r0 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r12, r3 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r0, r3, d10 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d11 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 +; CHECK-NEXT: ubfx r2, r2, #8, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q5[2], q5[0], r1, r3 -; CHECK-NEXT: vmov q5[3], q5[1], r1, r3 +; 
CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 ; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r1 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov q6[2], q6[0], r2, r1 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d10 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d11 +; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -918,35 +888,31 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d7 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d6 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r0, r2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds r1, r1, r2 ; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov.u16 r3, q2[4] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r3, q2[5] @@ -964,35 +930,31 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds.w lr, r1, r0 +; CHECK-NEXT: adc.w r1, r12, r3 +; CHECK-NEXT: vmov r3, r0, d5 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.u8 r3, q0[14] -; 
CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -1049,10 +1011,8 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: vmov r12, s23 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov r1, r12, d11 +; CHECK-NEXT: vmov r3, r2, d10 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 @@ -1068,15 +1028,13 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r0 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: vmov r0, r3, d10 +; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q4[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q4[4] ; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q4[7] @@ -1097,35 +1055,31 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q5[3], q5[1], r3, r0 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r0, r3, d8 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d9 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 -; CHECK-NEXT: vmov.s8 r2, q0[7] -; CHECK-NEXT: vmov.s8 r3, q0[6] -; CHECK-NEXT: vmov q5[2], q5[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r1 +; CHECK-NEXT: vmov.s8 r1, q0[7] +; CHECK-NEXT: vmov.s8 r2, q0[6] +; CHECK-NEXT: vmov q5[2], q5[0], r2, r1 +; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r3, r2 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r1 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: adds.w r12, r0, r3 -; 
CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d8 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d9 +; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q3[8] ; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov.u8 r2, q3[9] @@ -1165,37 +1119,33 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d5 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov.s8 r2, q0[11] -; CHECK-NEXT: vmov.s8 r3, q0[10] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.s8 r1, q0[11] +; CHECK-NEXT: vmov.s8 r2, q0[10] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.u16 r3, q1[4] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] ; CHECK-NEXT: vmov.u16 r3, q1[5] @@ -1215,35 +1165,31 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d3 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov.s8 r2, q0[15] -; CHECK-NEXT: vmov.s8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.s8 r1, q0[15] +; CHECK-NEXT: vmov.s8 r2, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov 
q0[3], q0[1], r2, r1 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: @@ -1285,12 +1231,10 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r12, r2, d7 +; CHECK-NEXT: vmov r3, r1, d6 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: add.w r2, r3, r12 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 ; CHECK-NEXT: rsbs r3, r3, #0 @@ -1302,24 +1246,22 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r3, r0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r0, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u16 r1, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 -; CHECK-NEXT: vmov.u16 r1, q2[7] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: and r0, r1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r0, r2, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r3 @@ -1329,33 +1271,29 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r12, r3 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d5 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 +; CHECK-NEXT: ubfx r2, r2, #8, #1 ; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 ; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: 
adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -1397,10 +1335,8 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r1, r12, d5 +; CHECK-NEXT: vmov r3, r2, d4 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 @@ -1418,15 +1354,13 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r0 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r1, r1, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds.w r12, r1, r0 +; CHECK-NEXT: vmov r0, r3, d4 +; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q1[4] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] @@ -1449,37 +1383,33 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r0 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: adds.w r12, r12, r0 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r0, d3 ; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r12, r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: ubfx r1, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r3, r3, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: adds r1, r1, r3 +; CHECK-NEXT: adcs r2, r0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -1543,12 +1473,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: add r0, r1 -; 
CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1584,12 +1512,10 @@ ; CHECK-NEXT: asrs r1, r1, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1602,11 +1528,9 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %b) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r1, r2, d2 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -1617,12 +1541,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i64> %b, zeroinitializer @@ -1695,14 +1617,12 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1737,14 +1657,12 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1865,34 +1783,30 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r3, r12 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov lr, s14 -; CHECK-NEXT: orr.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: add lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: vmov r12, lr, d7 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: orr.w lr, lr, r4 +; CHECK-NEXT: ubfx r4, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: vmov q3[3], 
q3[1], r2, r3 +; CHECK-NEXT: add r3, r12 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r4 ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov.u16 r4, q0[2] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov r2, r4, d6 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc.w r3, lr, r4 +; CHECK-NEXT: vmov r4, r2, d7 +; CHECK-NEXT: adds.w lr, r12, r4 +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r3, q2[6] +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r3, q2[5] ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 @@ -1909,18 +1823,16 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov r3, r4, d5 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc.w r4, r4, r12 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] @@ -1928,16 +1840,14 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d1 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, pc} entry: @@ -1979,16 +1889,14 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds r5, r4, r2 -; CHECK-NEXT: ubfx r4, lr, #12, #1 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds r5, r3, r2 ; CHECK-NEXT: ubfx r2, lr, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adc.w r3, r4, r12 +; CHECK-NEXT: ubfx r4, lr, #12, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 ; CHECK-NEXT: vmov.s16 r2, q0[3] @@ -1998,13 +1906,11 @@ ; CHECK-NEXT: asrs r4, r4, #31 ; 
CHECK-NEXT: vmov q3[3], q3[1], r4, r2 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r5, r2 +; CHECK-NEXT: vmov r2, r4, d4 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: vmov.u16 r5, q1[6] ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u16 r4, q1[4] @@ -2027,37 +1933,33 @@ ; CHECK-NEXT: asrs r4, r4, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d3 ; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #12, #1 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: ubfx r3, r5, #12, #1 ; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 -; CHECK-NEXT: vmov.s16 r5, q0[7] -; CHECK-NEXT: vmov.s16 r4, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov.s16 r3, q0[7] +; CHECK-NEXT: vmov.s16 r5, q0[6] +; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer @@ -2071,6 +1973,8 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 @@ -2087,15 +1991,13 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: orr.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: orr.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: bx lr +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> @@ -2133,14 +2035,12 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vand 
q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2304,8 +2204,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vcmp.i8 eq, q1, zr @@ -2350,34 +2250,30 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r3, r12 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r12, s27 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov lr, s26 -; CHECK-NEXT: orr.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: add lr, r3 -; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: vmov r12, lr, d13 +; CHECK-NEXT: vmov r3, r4, d12 +; CHECK-NEXT: orr.w lr, lr, r4 +; CHECK-NEXT: ubfx r4, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: add r3, r12 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r4 +; CHECK-NEXT: vmov q6[3], q6[1], r2, r4 ; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vmov q7[2], q7[0], r4, r2 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov r2, r4, d12 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc.w r3, lr, r4 +; CHECK-NEXT: vmov r4, r2, d13 +; CHECK-NEXT: adds.w lr, r12, r4 +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u16 r3, q5[6] +; CHECK-NEXT: vmov.u16 r2, q5[4] +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 @@ -2394,18 +2290,16 @@ ; CHECK-NEXT: vmov q6[2], q6[0], r4, r3 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: adds.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, r4, d10 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov r3, r4, d11 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc.w r4, r4, r12 ; CHECK-NEXT: vmov q5[2], 
q5[0], r2, r3 ; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[7] @@ -2413,14 +2307,12 @@ ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: adc.w r4, r12, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds.w r12, lr, r3 -; CHECK-NEXT: adc.w lr, r4, r2 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d11 +; CHECK-NEXT: adds.w lr, r12, r4 +; CHECK-NEXT: adc.w r12, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -2440,101 +2332,93 @@ ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.u16 r4, q2[0] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov.u16 r3, q2[0] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.u16 r4, q2[1] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 +; CHECK-NEXT: vmov.u16 r3, q2[1] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r3, r2, #1 -; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: and r4, r2, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r4 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: vmov.u8 r4, q0[8] ; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds.w r5, r12, r4 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adc.w r12, lr, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: ubfx r4, r2, #12, #1 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r4 +; CHECK-NEXT: adc.w r4, r4, r12 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov.u16 r4, q2[4] -; CHECK-NEXT: adc.w r12, r2, r3 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d7 +; CHECK-NEXT: adds.w lr, r12, r4 +; CHECK-NEXT: adc.w r12, r3, r2 ; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q2[7] -; CHECK-NEXT: vmov.u16 r4, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov 
q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.u16 r3, q2[5] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r4, r2, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adc.w r5, r12, r4 -; CHECK-NEXT: ubfx r4, r3, #12, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r4 +; CHECK-NEXT: vmov r3, r4, d5 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: ubfx r2, r2, #8, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc.w r4, r4, r12 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d1 +; CHECK-NEXT: adds.w r4, r4, r12 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2594,16 +2478,14 @@ ; CHECK-NEXT: asrs r2, r2, #31 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r12, s23 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: adds r5, r4, r2 -; CHECK-NEXT: ubfx r4, lr, #12, #1 +; CHECK-NEXT: vmov r2, r12, d11 +; CHECK-NEXT: vmov r3, r4, d10 +; CHECK-NEXT: adds r5, r3, r2 ; CHECK-NEXT: ubfx r2, lr, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adc.w r3, r4, r12 +; CHECK-NEXT: ubfx r4, lr, #12, #1 +; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov q5[2], q5[0], r2, r4 ; CHECK-NEXT: vmov q5[3], q5[1], r2, r4 ; CHECK-NEXT: vmov.s8 r2, q0[3] @@ -2613,13 +2495,11 @@ ; CHECK-NEXT: asrs r4, r4, #31 
; CHECK-NEXT: vmov q6[3], q6[1], r4, r2 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: adcs r3, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds.w r12, r5, r2 +; CHECK-NEXT: vmov r2, r4, d10 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r5, r4, d11 +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: vmov.u16 r5, q4[6] ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u16 r4, q4[4] @@ -2642,35 +2522,31 @@ ; CHECK-NEXT: asrs r4, r4, #31 ; CHECK-NEXT: vmov q5[3], q5[1], r4, r2 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov r2, r4, d8 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d9 ; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #12, #1 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: ubfx r3, r5, #12, #1 ; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q0[7] -; CHECK-NEXT: vmov.s8 r4, q0[6] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r5 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov q4[2], q4[0], r5, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r5, r3 +; CHECK-NEXT: vmov.s8 r3, q0[7] +; CHECK-NEXT: vmov.s8 r5, q0[6] +; CHECK-NEXT: vmov q5[2], q5[0], r5, r3 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r5 +; CHECK-NEXT: vmov q5[3], q5[1], r5, r3 ; CHECK-NEXT: vand q4, q5, q4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: adds.w r12, r2, r4 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r3, r5, d8 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: vmov r2, r4, d9 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc.w r3, r5, r4 ; CHECK-NEXT: vmov.u8 r5, q3[8] ; CHECK-NEXT: vmov.16 q4[0], r5 ; CHECK-NEXT: vmov.u8 r5, q3[9] @@ -2710,37 +2586,33 @@ ; CHECK-NEXT: asrs r4, r4, #31 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov r2, r4, d4 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d5 ; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #12, #1 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: ubfx r3, r5, #12, #1 ; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q0[11] -; CHECK-NEXT: vmov.s8 r4, q0[10] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r5, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 +; CHECK-NEXT: vmov.s8 r3, q0[11] +; CHECK-NEXT: vmov.s8 r5, q0[10] +; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: asrs r5, 
r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r5, r3 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: adds.w r12, r2, r4 -; CHECK-NEXT: vmov.u16 r4, q1[4] -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r3, r5, d4 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc.w r3, r5, r4 ; CHECK-NEXT: vmov.u16 r5, q1[6] +; CHECK-NEXT: vmov.u16 r4, q1[4] ; CHECK-NEXT: vmov q2[2], q2[0], r4, r5 ; CHECK-NEXT: vmov.u16 r5, q1[7] ; CHECK-NEXT: vmov.u16 r4, q1[5] @@ -2760,37 +2632,33 @@ ; CHECK-NEXT: asrs r4, r4, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r2 ; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r2, r4, d2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov r4, r2, d3 ; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: ubfx r4, r5, #12, #1 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: ubfx r3, r5, #12, #1 ; CHECK-NEXT: ubfx r5, r5, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 -; CHECK-NEXT: rsb.w r5, r5, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r4 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r4 -; CHECK-NEXT: vmov.s8 r5, q0[15] -; CHECK-NEXT: vmov.s8 r4, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov.s8 r3, q0[15] +; CHECK-NEXT: vmov.s8 r5, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 +; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: asrs r5, r5, #31 -; CHECK-NEXT: asrs r4, r4, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -2805,6 +2673,8 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 @@ -2821,15 +2691,13 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: orr.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r12, lr, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: orr.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: bx lr 
+; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> @@ -2867,14 +2735,12 @@ ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2890,29 +2756,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r12, s5 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vmov r3, r2, d2 +; CHECK-NEXT: csetm r12, ne ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: orrs.w r3, r3, r12 -; CHECK-NEXT: cset r3, eq -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r12 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r12 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i64> %b, zeroinitializer Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll @@ -16,11 +16,9 @@ define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) { ; CHECK-LABEL: and_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -33,11 +31,9 @@ ; CHECK-LABEL: and_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -49,11 +45,9 @@ define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) { ; CHECK-LABEL: and_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 ; CHECK-NEXT: bx lr @@ -170,12 +164,10 @@ define arm_aapcs_vfpcc i64 @and_v2i64(<2 x i64> %x) { ; CHECK-LABEL: and_v2i64: ; CHECK: @ 
%bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: ands r0, r2 +; CHECK-NEXT: ands r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x) @@ -186,12 +178,10 @@ ; CHECK-LABEL: and_v4i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: ands r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: ands r0, r2 +; CHECK-NEXT: ands r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x) @@ -215,11 +205,9 @@ define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: and_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 @@ -234,11 +222,9 @@ ; CHECK-LABEL: and_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 @@ -252,11 +238,9 @@ define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: and_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: ands r2, r3 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: ands r0, r1 @@ -388,15 +372,15 @@ define arm_aapcs_vfpcc i64 @and_v2i64_acc(<2 x i64> %x, i64 %y) { ; CHECK-LABEL: and_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: ands r2, r3 -; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: ands r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ands r2, r3 +; CHECK-NEXT: and.w r2, lr, r12 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %x) %r = and i64 %y, %z @@ -406,16 +390,16 @@ define arm_aapcs_vfpcc i64 @and_v4i64_acc(<4 x i64> %x, i64 %y) { ; CHECK-LABEL: and_v4i64_acc: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: ands r2, r3 -; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: ands r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ands r2, r3 +; CHECK-NEXT: and.w r2, lr, r12 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %x) %r = and i64 %y, %z @@ -437,11 +421,9 @@ define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) { ; CHECK-LABEL: or_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; 
CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -454,11 +436,9 @@ ; CHECK-LABEL: or_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -470,11 +450,9 @@ define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) { ; CHECK-LABEL: or_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: bx lr @@ -591,12 +569,10 @@ define arm_aapcs_vfpcc i64 @or_v2i64(<2 x i64> %x) { ; CHECK-LABEL: or_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) @@ -607,12 +583,10 @@ ; CHECK-LABEL: or_v4i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) @@ -636,11 +610,9 @@ define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: or_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 @@ -655,11 +627,9 @@ ; CHECK-LABEL: or_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 @@ -673,11 +643,9 @@ define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: or_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: orrs r2, r3 ; CHECK-NEXT: orrs r1, r2 ; CHECK-NEXT: orrs r0, r1 @@ -809,15 +777,15 @@ define arm_aapcs_vfpcc i64 @or_v2i64_acc(<2 x i64> %x, i64 %y) { ; CHECK-LABEL: or_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 ; 
CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: orr.w r2, lr, r12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %x) %r = or i64 %y, %z @@ -827,16 +795,16 @@ define arm_aapcs_vfpcc i64 @or_v4i64_acc(<4 x i64> %x, i64 %y) { ; CHECK-LABEL: or_v4i64_acc: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: orr.w r2, lr, r12 ; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %x) %r = or i64 %y, %z @@ -858,11 +826,9 @@ define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) { ; CHECK-LABEL: xor_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -875,11 +841,9 @@ ; CHECK-LABEL: xor_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -891,11 +855,9 @@ define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) { ; CHECK-LABEL: xor_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr @@ -1012,12 +974,10 @@ define arm_aapcs_vfpcc i64 @xor_v2i64(<2 x i64> %x) { ; CHECK-LABEL: xor_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) @@ -1028,12 +988,10 @@ ; CHECK-LABEL: xor_v4i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: eors r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: eors r1, r3 ; CHECK-NEXT: bx lr entry: %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) @@ -1057,11 +1015,9 @@ define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: xor_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 @@ -1076,11 +1032,9 @@ ; CHECK-LABEL: xor_v8i32_acc: ; CHECK: @ 
%bb.0: @ %entry ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 @@ -1094,11 +1048,9 @@ define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: xor_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: eors r2, r3 ; CHECK-NEXT: eors r1, r2 ; CHECK-NEXT: eors r0, r1 @@ -1230,15 +1182,15 @@ define arm_aapcs_vfpcc i64 @xor_v2i64_acc(<2 x i64> %x, i64 %y) { ; CHECK-LABEL: xor_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: eors r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, lr, r12 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %x) %r = xor i64 %y, %z @@ -1248,16 +1200,16 @@ define arm_aapcs_vfpcc i64 @xor_v4i64_acc(<4 x i64> %x, i64 %y) { ; CHECK-LABEL: xor_v4i64_acc: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: veor q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: eors r2, r3 -; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: eors r0, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: eors r2, r3 +; CHECK-NEXT: eor.w r2, lr, r12 ; CHECK-NEXT: eors r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %x) %r = xor i64 %y, %z Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -96,8 +96,8 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-LABEL: mul_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB1_8 @@ -119,14 +119,12 @@ ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov lr, r3, d1 ; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: mul lr, r3, r2 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: mul r3, lr, r3 +; CHECK-NEXT: mul r2, r4, r2 ; CHECK-NEXT: mul r2, r3, r2 -; CHECK-NEXT: mul r2, r2, lr ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r12 @@ -138,7 +136,7 @@ ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup @@ -190,8 +188,8 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-LABEL: 
and_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -217,13 +215,11 @@ ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: le lr, .LBB2_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: and.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: and.w r2, r2, lr +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: and.w r12, r12, lr +; CHECK-NEXT: and.w r2, r2, r4 ; CHECK-NEXT: and.w r2, r2, r12 ; CHECK-NEXT: beq .LBB2_9 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1 @@ -236,7 +232,7 @@ ; CHECK-NEXT: le lr, .LBB2_8 ; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup @@ -288,8 +284,8 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-LABEL: or_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -315,13 +311,11 @@ ; CHECK-NEXT: vorr q0, q1, q0 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: orr.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: orr.w r2, r2, lr +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: orr.w r12, r12, lr +; CHECK-NEXT: orr.w r2, r2, r4 ; CHECK-NEXT: orr.w r2, r2, r12 ; CHECK-NEXT: beq .LBB3_9 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1 @@ -334,7 +328,7 @@ ; CHECK-NEXT: le lr, .LBB3_8 ; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup @@ -386,8 +380,8 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) { ; CHECK-LABEL: xor_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: blt .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader @@ -413,13 +407,11 @@ ; CHECK-NEXT: veor q0, q1, q0 ; CHECK-NEXT: le lr, .LBB4_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: cmp r3, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: eor.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: eor.w r2, r2, lr +; CHECK-NEXT: vmov r2, r4, d0 +; CHECK-NEXT: eor.w r12, r12, lr +; CHECK-NEXT: eor.w r2, r2, r4 ; CHECK-NEXT: eor.w r2, r2, r12 ; CHECK-NEXT: beq .LBB4_9 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1 @@ -432,7 +424,7 @@ ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll =================================================================== --- 
llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -42,12 +42,10 @@ ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmullb.u32 q2, q0, q1 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -61,12 +59,10 @@ ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmullb.s32 q2, q0, q1 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> @@ -247,12 +243,18 @@ ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -265,16 +267,22 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -549,8 +557,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r0, q1[3] @@ -569,159 +577,161 @@ ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.u8 r2, q1[1] ; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r3, q1[0] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vand q4, q4, q2 -; 
CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov q5[2], q5[0], r1, r3 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vand q5, q5, q2 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov lr, s12 -; CHECK-NEXT: vmov r12, s13 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: smlabb r0, r4, r3, r0 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov lr, r12, d6 +; CHECK-NEXT: umull r1, r2, r1, r2 +; CHECK-NEXT: smlabb r0, r0, r3, r1 ; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q1[4] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adc.w r1, r2, r12 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.u8 r3, q1[4] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[4] ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r0, r1 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q1[6] +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q1[8] +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov q3[2], q3[0], 
r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q1[10] +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q1[12] +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; 
CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov.u8 r3, q1[14] +; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umlal r0, r1, r3, r2 ; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: umull r1, r3, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -743,27 +753,25 @@ ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.s8 r2, q1[3] +; CHECK-NEXT: adc.w lr, r3, r1 ; CHECK-NEXT: vmov.s8 r3, q0[3] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.s8 r1, q1[3] ; CHECK-NEXT: vmov.s8 r0, q1[2] -; CHECK-NEXT: vmov.s8 r2, q0[2] -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: smull r0, r2, r2, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: vmov.s8 r1, q0[2] +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[5] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.s8 r3, q0[5] ; CHECK-NEXT: vmov.s8 r0, q1[4] ; CHECK-NEXT: vmov.s8 r1, q0[4] @@ -771,14 +779,13 @@ ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[7] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.s8 r3, q0[7] ; CHECK-NEXT: vmov.s8 r0, q1[6] ; CHECK-NEXT: vmov.s8 r1, q0[6] @@ -786,14 +793,13 @@ ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w 
r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[9] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.s8 r3, q0[9] ; CHECK-NEXT: vmov.s8 r0, q1[8] ; CHECK-NEXT: vmov.s8 r1, q0[8] @@ -801,14 +807,13 @@ ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[11] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.s8 r3, q0[11] ; CHECK-NEXT: vmov.s8 r0, q1[10] ; CHECK-NEXT: vmov.s8 r1, q0[10] @@ -816,14 +821,13 @@ ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[13] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.s8 r3, q0[13] ; CHECK-NEXT: vmov.s8 r0, q1[12] ; CHECK-NEXT: vmov.s8 r1, q0[12] @@ -831,19 +835,26 @@ ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov.s8 r2, q1[14] -; CHECK-NEXT: vmov.s8 r3, q0[14] -; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.s8 r2, q1[15] ; CHECK-NEXT: vmov.s8 r3, q0[15] -; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r0, q1[14] +; CHECK-NEXT: vmov.s8 r1, q0[14] +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> @@ -942,16 +953,22 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smlal 
r0, r1, r3, r2 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -964,25 +981,25 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: umull r12, r2, r1, r0 -; CHECK-NEXT: mla r1, r1, r3, r2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: mla lr, r2, r0, r1 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: umull r3, r1, r2, r0 -; CHECK-NEXT: mla r1, r2, r4, r1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: mla r1, r2, r0, r1 -; CHECK-NEXT: adds.w r0, r12, r3 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, lr, d1 +; CHECK-NEXT: vmov r4, r9, d2 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: umull r1, r8, r2, r0 +; CHECK-NEXT: umull r3, r5, r6, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-NEXT: mla r1, r2, r12, r8 +; CHECK-NEXT: mla r1, lr, r0, r1 +; CHECK-NEXT: mla r0, r6, r9, r5 +; CHECK-NEXT: mla r0, r7, r4, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %m = mul <2 x i64> %x, %y %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) @@ -1035,14 +1052,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmullb.u32 q2, q0, q1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov lr, s9 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r3, r2, d4 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -1059,14 +1074,12 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmullb.s32 q2, q0, q1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov lr, s9 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r3, r2, d4 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i32> %x to <2 x i64> @@ -1230,14 +1243,20 @@ ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: umull r2, lr, r3, r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r2, lr, r3, r12 +; CHECK-NEXT: vmov lr, s4 +; CHECK-NEXT: umull r12, r3, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r2, lr, r2, lr +; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: vmov 
q0[3], q0[1], lr, r3 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -1253,18 +1272,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smull r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: sxth.w lr, r3 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smlal r2, r12, r3, lr +; CHECK-NEXT: smull r12, r3, r3, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: sxth.w lr, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: smull r2, lr, r2, lr +; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -1462,8 +1487,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov.u8 r2, q1[3] @@ -1477,166 +1502,168 @@ ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r12, s14 ; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.u8 r4, q1[0] +; CHECK-NEXT: vmov.u8 r4, q0[0] ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.u8 r5, q0[0] ; CHECK-NEXT: umull lr, r12, r2, r12 ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: umull r2, r3, r2, r3 ; CHECK-NEXT: vmov q3[2], q3[0], r2, lr -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 -; CHECK-NEXT: vmov.u8 r4, q0[1] -; CHECK-NEXT: vmov q5[2], q5[0], r5, r4 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 +; CHECK-NEXT: vmov.u8 r3, q1[1] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q5, q5, q2 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r6, s22 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov lr, s13 -; CHECK-NEXT: umull r2, r4, r4, r2 -; CHECK-NEXT: smlabb r2, r6, r5, r2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r4, lr -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: adds.w lr, r2, r6 -; CHECK-NEXT: vmov.u8 r6, q1[5] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: vmov r5, s22 +; CHECK-NEXT: vmov lr, r12, d6 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: smlabb r2, r5, r4, r2 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q1[5] +; CHECK-NEXT: adc.w 
lr, r3, r4 +; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[5] +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 +; CHECK-NEXT: vmov r5, s14 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w r6, r6, lr -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[7] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[6] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u8 r5, q1[7] +; CHECK-NEXT: vmov.u8 r4, q1[6] +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[9] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u8 r5, q1[9] +; CHECK-NEXT: vmov.u8 r4, q1[8] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; 
CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[11] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[10] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u8 r5, q1[11] +; CHECK-NEXT: vmov.u8 r4, q1[10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[13] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u8 r5, q1[13] +; CHECK-NEXT: vmov.u8 r4, q1[12] +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[15] -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[14] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.u8 r5, q1[15] +; CHECK-NEXT: vmov.u8 r4, q1[14] +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 +; CHECK-NEXT: vmov.u8 r4, q0[15] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r6, 
s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umull r2, r5, r2, r5 +; CHECK-NEXT: umull r3, r4, r3, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1653,48 +1680,45 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vmov.s8 r2, q1[1] ; CHECK-NEXT: vmov.s8 r3, q0[1] -; CHECK-NEXT: smull r12, r3, r3, r2 -; CHECK-NEXT: vmov.s8 lr, q1[0] +; CHECK-NEXT: smull lr, r12, r3, r2 +; CHECK-NEXT: vmov.s8 r3, q1[0] ; CHECK-NEXT: vmov.s8 r2, q0[0] ; CHECK-NEXT: vmov.s8 r4, q1[2] ; CHECK-NEXT: vmov.s8 r5, q0[2] -; CHECK-NEXT: smull r2, lr, r2, lr -; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, lr ; CHECK-NEXT: smull r4, r5, r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], lr, r3 -; CHECK-NEXT: vmov lr, s10 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r12, s9 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q1[3] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r12 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r3, r2, d4 +; CHECK-NEXT: adds.w lr, lr, r3 ; CHECK-NEXT: vmov.s8 r3, q0[3] +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov.s8 r2, q1[3] ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 ; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r5, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r5, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov.s8 r5, q1[5] ; CHECK-NEXT: vmov.s8 r4, q0[5] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.s8 r3, q1[5] ; CHECK-NEXT: vmov.s8 r2, q1[4] -; CHECK-NEXT: vmov.s8 r5, q0[4] -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 +; CHECK-NEXT: vmov.s8 r3, q0[4] +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 +; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov.s8 r5, q1[7] -; CHECK-NEXT: adc.w r12, r2, r4 ; CHECK-NEXT: vmov.s8 r4, q0[7] ; CHECK-NEXT: vmov.s8 r2, q1[6] ; CHECK-NEXT: vmov.s8 r3, q0[6] @@ -1702,14 +1726,13 @@ ; CHECK-NEXT: smull r2, r3, r3, r2 ; 
CHECK-NEXT: vmov q2[2], q2[0], r2, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov.s8 r5, q1[9] -; CHECK-NEXT: adc.w r12, r2, r4 ; CHECK-NEXT: vmov.s8 r4, q0[9] ; CHECK-NEXT: vmov.s8 r2, q1[8] ; CHECK-NEXT: vmov.s8 r3, q0[8] @@ -1717,14 +1740,13 @@ ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov.s8 r5, q1[11] -; CHECK-NEXT: adc.w r12, r2, r4 ; CHECK-NEXT: vmov.s8 r4, q0[11] ; CHECK-NEXT: vmov.s8 r2, q1[10] ; CHECK-NEXT: vmov.s8 r3, q0[10] @@ -1732,14 +1754,13 @@ ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov.s8 r5, q1[13] -; CHECK-NEXT: adc.w r12, r2, r4 ; CHECK-NEXT: vmov.s8 r4, q0[13] ; CHECK-NEXT: vmov.s8 r2, q1[12] ; CHECK-NEXT: vmov.s8 r3, q0[12] @@ -1747,21 +1768,28 @@ ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 ; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov.s8 r5, q1[14] -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov.s8 r4, q0[14] -; CHECK-NEXT: smlal r3, r2, r4, r5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 +; CHECK-NEXT: adc.w lr, r3, r4 ; CHECK-NEXT: vmov.s8 r5, q1[15] ; CHECK-NEXT: vmov.s8 r4, q0[15] -; CHECK-NEXT: smlal r3, r2, r4, r5 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov.s8 r2, q1[14] +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: smull r5, r4, r4, r5 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> @@ -1805,18 +1833,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull 
r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: sxtb.w lr, r3 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smlal r2, r12, r3, lr +; CHECK-NEXT: smull r12, r3, r3, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smull r2, lr, r2, lr +; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -1830,27 +1864,27 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) { ; CHECK-LABEL: add_v2i64_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: umull lr, r12, r3, r2 -; CHECK-NEXT: mla r3, r3, r4, r12 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: mla r12, r4, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: umull r2, r5, r4, r3 -; CHECK-NEXT: mla r4, r4, r6, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: mla r3, r5, r3, r4 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: vmov r2, r12, d3 +; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r6, r9, d2 +; CHECK-NEXT: vmov r5, r11, d0 +; CHECK-NEXT: umull r10, r8, r3, r2 +; CHECK-NEXT: umull r4, r7, r5, r6 +; CHECK-NEXT: mla r3, r3, r12, r8 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 +; CHECK-NEXT: mla r2, lr, r2, r3 +; CHECK-NEXT: mla r3, r5, r9, r7 +; CHECK-NEXT: mla r3, r11, r6, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r7, r6, d0 +; CHECK-NEXT: adds r3, r3, r7 +; CHECK-NEXT: adcs r2, r6 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %m = mul <2 x i64> %x, %y %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -64,12 +64,10 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -98,12 +96,10 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; 
CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -348,12 +344,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -395,12 +389,10 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -836,18 +828,16 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w lr, r2, r1 -; CHECK-NEXT: vmov.u8 r1, q4[2] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w lr, r3, r1 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r0, r0, #0 +; CHECK-NEXT: vmov.u8 r1, q4[2] ; CHECK-NEXT: vmov q7[2], q7[0], r0, r3 +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov q7[3], q7[1], r0, r3 ; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.u8 r3, q1[2] @@ -866,17 +856,15 @@ ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: vand q0, q0, q7 ; CHECK-NEXT: vmov q7, q4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.u8 r1, q4[4] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r2, q6[6] ; CHECK-NEXT: vmov.u16 r3, q6[4] +; CHECK-NEXT: vmov.u8 r1, q4[4] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.u16 r3, q6[5] @@ -905,23 +893,21 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r1 ; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q4[6] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: 
adcs r1, r4 ; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov.u8 r3, q1[6] +; CHECK-NEXT: vmov.u8 r4, q4[6] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q4[7] ; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 @@ -937,14 +923,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q5[8] ; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] @@ -995,23 +979,21 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q7[10] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov.u8 r3, q1[10] +; CHECK-NEXT: vmov.u8 r4, q7[10] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q7[11] ; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 @@ -1026,13 +1008,11 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q3[6] ; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r3, q3[4] @@ -1065,23 +1045,21 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r3, lr, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r3, r4 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r3, r4, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r4, q7[14] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov.u8 r3, q1[14] +; CHECK-NEXT: vmov.u8 r4, q7[14] ; 
CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q7[15] ; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 @@ -1096,12 +1074,10 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #40 @@ -1170,11 +1146,9 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r2, r1 ; CHECK-NEXT: vmov q7[3], q7[1], r3, r12 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov r1, s24 -; CHECK-NEXT: vmov r12, s27 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: adds.w lr, r1, r3 +; CHECK-NEXT: vmov r1, r12, d13 +; CHECK-NEXT: vmov r3, r2, d12 +; CHECK-NEXT: adds.w lr, r3, r1 ; CHECK-NEXT: ubfx r3, r0, #12, #1 ; CHECK-NEXT: ubfx r0, r0, #8, #1 ; CHECK-NEXT: rsb.w r3, r3, #0 @@ -1191,21 +1165,19 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r1, r0 ; CHECK-NEXT: vmov q7[3], q7[1], r2, r3 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r1, s24 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.s8 r1, q1[4] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r2, q5[6] ; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: smull r1, r4, r4, r1 +; CHECK-NEXT: vmov.s8 r1, q1[4] ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: smull r1, r4, r4, r1 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 @@ -1221,38 +1193,32 @@ ; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: vmov.s8 r4, q1[6] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r3, r4, d11 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.s8 r0, q0[6] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 -; CHECK-NEXT: smull r0, r4, r0, r4 +; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q5[3], q5[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[7] ; CHECK-NEXT: vmov.s8 r3, q0[7] +; CHECK-NEXT: vmov.s8 r4, q1[6] +; CHECK-NEXT: vmov.s8 r0, q0[6] ; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: smull r0, r4, r0, r4 ; CHECK-NEXT: vmov q6[2], q6[0], r0, r2 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r3 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adds.w r12, r1, r0 -; 
CHECK-NEXT: vmov.s8 r0, q1[8] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: vmov r0, r2, d10 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -1269,15 +1235,17 @@ ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.s8 r1, q0[8] +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.s8 r0, q1[8] ; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.s8 r1, q0[8] ; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov.u16 r3, q2[0] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] ; CHECK-NEXT: vmov.u16 r3, q2[1] +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 @@ -1293,47 +1261,43 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r1, r4 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; CHECK-NEXT: vmov.s8 r4, q1[10] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.s8 r0, q0[10] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 -; CHECK-NEXT: smull r0, r4, r0, r4 +; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[11] ; CHECK-NEXT: vmov.s8 r3, q0[11] +; CHECK-NEXT: vmov.s8 r4, q1[10] +; CHECK-NEXT: vmov.s8 r0, q0[10] ; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: smull r0, r4, r0, r4 ; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r3 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: vmov.s8 r0, q1[12] -; CHECK-NEXT: adc.w lr, r2, r3 +; CHECK-NEXT: vmov r0, r2, d6 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmov.s8 r1, q0[12] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.s8 r0, q1[12] ; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov.s8 r1, q0[12] ; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r4, r2, #1 ; CHECK-NEXT: ubfx r3, r2, #4, #1 @@ -1347,37 +1311,33 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r4 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r1, r1, r12 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r1, r1, r4 -; 
CHECK-NEXT: vmov.s8 r4, q1[14] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r3, r4, d5 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.s8 r0, q0[14] +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: smull r0, r4, r0, r4 +; CHECK-NEXT: adcs r1, r4 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[15] ; CHECK-NEXT: vmov.s8 r3, q0[15] +; CHECK-NEXT: vmov.s8 r4, q1[14] +; CHECK-NEXT: vmov.s8 r0, q0[14] ; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: smull r0, r4, r0, r4 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r3 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} entry: @@ -1498,12 +1458,10 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1545,12 +1503,10 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1565,29 +1521,23 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r4, r5, r2, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r4, lr -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: mla r1, r1, r4, r12 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: mla r0, r4, r0, r1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: mla r1, r2, r1, r5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: mla r1, r2, r3, r1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: vmov r0, r12, d3 +; CHECK-NEXT: vmov r2, lr, d1 +; CHECK-NEXT: vmov r4, r9, d2 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: umull r1, r8, r2, r0 +; CHECK-NEXT: umull r3, r5, r6, r4 +; CHECK-NEXT: vmov 
q0[2], q0[0], r3, r1 +; CHECK-NEXT: mla r1, r2, r12, r8 +; CHECK-NEXT: mla r0, lr, r0, r1 +; CHECK-NEXT: mla r1, r6, r9, r5 +; CHECK-NEXT: mla r1, r7, r4, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d5 ; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne @@ -1595,16 +1545,14 @@ ; CHECK-NEXT: cset r1, eq ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y @@ -1681,14 +1629,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1720,14 +1666,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -1947,14 +1891,12 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -1999,14 +1941,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; 
CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2311,18 +2251,16 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r12 ; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r6, r2, r3 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds r6, r3, r2 ; CHECK-NEXT: ubfx r2, lr, #12, #1 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q4[2] ; CHECK-NEXT: adc.w r12, r12, r4 ; CHECK-NEXT: ubfx r4, lr, #8, #1 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: vmov q7[2], q7[0], r4, r2 +; CHECK-NEXT: vmov.u8 r3, q4[2] ; CHECK-NEXT: vmov q7[3], q7[1], r4, r2 ; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r4, q1[2] @@ -2342,16 +2280,14 @@ ; CHECK-NEXT: vmov.u8 r4, q4[4] ; CHECK-NEXT: vand q0, q0, q7 ; CHECK-NEXT: vmov q7, q4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u16 r2, q6[6] ; CHECK-NEXT: vmov.u16 r6, q6[4] +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.u16 r6, q6[5] @@ -2380,20 +2316,18 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov.u8 r5, q4[6] +; CHECK-NEXT: adc.w r12, r6, r4 ; CHECK-NEXT: ubfx r6, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q4[6] ; CHECK-NEXT: vmov q6[3], q6[1], r2, r6 ; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov.u8 r6, q1[6] @@ -2413,15 +2347,12 @@ ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 ; CHECK-NEXT: vmov.u8 r4, q7[8] ; CHECK-NEXT: vand q0, q0, q6 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r6 +; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u8 r2, q5[8] ; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] @@ -2438,6 +2369,7 @@ ; CHECK-NEXT: vmov.16 q6[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[15] ; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vcmp.i16 ne, q6, zr ; CHECK-NEXT: vpsel q3, q3, q0 ; CHECK-NEXT: vmov.u16 r2, q3[2] @@ -2470,20 +2402,18 @@ ; CHECK-NEXT: 
vmov q0[2], q0[0], r3, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov.u8 r5, q7[10] +; CHECK-NEXT: adc.w r12, r6, r4 ; CHECK-NEXT: ubfx r6, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q7[10] ; CHECK-NEXT: vmov q4[3], q4[1], r2, r6 ; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov.u8 r6, q1[10] @@ -2502,16 +2432,14 @@ ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 ; CHECK-NEXT: vmov.u8 r4, q7[12] ; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: adc.w lr, r2, r6 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u16 r2, q3[6] ; CHECK-NEXT: vmov.u16 r6, q3[4] +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] ; CHECK-NEXT: vmov.u16 r6, q3[5] @@ -2540,20 +2468,18 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds.w r6, r12, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r5, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc.w r6, lr, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov.u8 r5, q7[14] +; CHECK-NEXT: adc.w r12, r6, r4 ; CHECK-NEXT: ubfx r6, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r6, r6, #0 -; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r12, r5, r4 +; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov.u8 r5, q7[14] ; CHECK-NEXT: vmov q3[3], q3[1], r2, r6 ; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov.u8 r6, q1[14] @@ -2571,16 +2497,14 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2648,11 +2572,9 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 ; CHECK-NEXT: vmov q7[3], q7[1], r4, lr ; CHECK-NEXT: vand q6, q7, q6 
-; CHECK-NEXT: vmov r4, s26 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov lr, s27 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: adds r6, r2, r4 +; CHECK-NEXT: vmov r2, lr, d13 +; CHECK-NEXT: vmov r4, r3, d12 +; CHECK-NEXT: adds r6, r4, r2 ; CHECK-NEXT: ubfx r4, r12, #12, #1 ; CHECK-NEXT: ubfx r2, r12, #8, #1 ; CHECK-NEXT: rsb.w r4, r4, #0 @@ -2668,24 +2590,22 @@ ; CHECK-NEXT: vmov q7[2], q7[0], r3, r2 ; CHECK-NEXT: vmov q7[3], q7[1], r5, r4 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov r5, s27 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adc.w r6, lr, r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q1[4] -; CHECK-NEXT: adc.w lr, r6, r5 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vmov r6, r5, d13 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u16 r6, q5[6] +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vmov.u16 r5, q5[4] -; CHECK-NEXT: vmov.s8 r3, q0[4] ; CHECK-NEXT: vmov q6[2], q6[0], r5, r6 ; CHECK-NEXT: vmov.u16 r6, q5[7] ; CHECK-NEXT: vmov.u16 r5, q5[5] -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[4] ; CHECK-NEXT: vmov q6[3], q6[1], r5, r6 +; CHECK-NEXT: vmov.s8 r3, q0[4] ; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmrs r6, p0 ; CHECK-NEXT: and r4, r6, #1 ; CHECK-NEXT: ubfx r5, r6, #4, #1 @@ -2699,38 +2619,32 @@ ; CHECK-NEXT: vmov q6[2], q6[0], r2, r5 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r4 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov r5, s23 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, r2, lr -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov.s8 r4, q1[6] -; CHECK-NEXT: adc.w r12, r2, r5 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov r5, r4, d11 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: ubfx r5, r6, #12, #1 ; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.s8 r2, q0[6] +; CHECK-NEXT: rsb.w r5, r5, #0 +; CHECK-NEXT: rsb.w r6, r6, #0 ; CHECK-NEXT: vmov q5[2], q5[0], r6, r5 -; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov q5[3], q5[1], r6, r5 ; CHECK-NEXT: vmov.s8 r6, q1[7] ; CHECK-NEXT: vmov.s8 r5, q0[7] +; CHECK-NEXT: vmov.s8 r4, q1[6] +; CHECK-NEXT: vmov.s8 r2, q0[6] ; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: smull r2, r4, r2, r4 ; CHECK-NEXT: vmov q6[2], q6[0], r2, r6 ; CHECK-NEXT: vmov q6[3], q6[1], r4, r5 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov r5, s23 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adc.w r6, r12, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q1[8] -; CHECK-NEXT: adc.w lr, r6, r5 +; CHECK-NEXT: vmov r2, r6, d10 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d11 +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u8 r6, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r6 ; CHECK-NEXT: vmov.u8 r6, q4[9] @@ -2747,15 +2661,17 @@ ; CHECK-NEXT: vmov.16 q5[6], r6 ; CHECK-NEXT: vmov.u8 r6, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r6 -; CHECK-NEXT: vmov.s8 r3, q0[8] +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vcmp.i16 ne, q5, zr -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[8] ; CHECK-NEXT: vpsel q2, q3, q2 +; 
CHECK-NEXT: vmov.s8 r3, q0[8] ; CHECK-NEXT: vmov.u16 r6, q2[2] ; CHECK-NEXT: vmov.u16 r5, q2[0] ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 ; CHECK-NEXT: vmov.u16 r6, q2[3] ; CHECK-NEXT: vmov.u16 r5, q2[1] +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r6, p0 @@ -2771,47 +2687,43 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r2, r5 ; CHECK-NEXT: vmov q4[3], q4[1], r3, r4 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, r2, lr -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov.s8 r4, q1[10] -; CHECK-NEXT: adc.w r12, r2, r5 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov r5, r4, d7 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: ubfx r5, r6, #12, #1 ; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.s8 r2, q0[10] +; CHECK-NEXT: rsb.w r5, r5, #0 +; CHECK-NEXT: rsb.w r6, r6, #0 ; CHECK-NEXT: vmov q3[2], q3[0], r6, r5 -; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov q3[3], q3[1], r6, r5 ; CHECK-NEXT: vmov.s8 r6, q1[11] ; CHECK-NEXT: vmov.s8 r5, q0[11] +; CHECK-NEXT: vmov.s8 r4, q1[10] +; CHECK-NEXT: vmov.s8 r2, q0[10] ; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: smull r2, r4, r2, r4 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r6 ; CHECK-NEXT: vmov q4[3], q4[1], r4, r5 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adc.w r6, r12, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov.s8 r2, q1[12] -; CHECK-NEXT: adc.w lr, r6, r5 +; CHECK-NEXT: vmov r2, r6, d6 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d7 +; CHECK-NEXT: adds.w r12, r2, r6 ; CHECK-NEXT: vmov.u16 r6, q2[6] +; CHECK-NEXT: adc.w lr, r3, r5 ; CHECK-NEXT: vmov.u16 r5, q2[4] -; CHECK-NEXT: vmov.s8 r3, q0[12] ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 ; CHECK-NEXT: vmov.u16 r6, q2[7] ; CHECK-NEXT: vmov.u16 r5, q2[5] -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[12] ; CHECK-NEXT: vmov q3[3], q3[1], r5, r6 +; CHECK-NEXT: vmov.s8 r3, q0[12] ; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmrs r6, p0 ; CHECK-NEXT: and r4, r6, #1 ; CHECK-NEXT: ubfx r5, r6, #4, #1 @@ -2825,39 +2737,35 @@ ; CHECK-NEXT: vmov q3[2], q3[0], r2, r5 ; CHECK-NEXT: vmov q3[3], q3[1], r3, r4 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, r2, lr -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov.s8 r4, q1[14] -; CHECK-NEXT: adc.w r12, r2, r5 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: ubfx r5, r6, #12, #1 ; CHECK-NEXT: ubfx r6, r6, #8, #1 -; CHECK-NEXT: rsbs r5, r5, #0 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.s8 r2, q0[14] +; CHECK-NEXT: rsb.w r5, r5, #0 +; CHECK-NEXT: rsb.w r6, r6, #0 ; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-NEXT: smull r2, r4, r2, r4 +; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov q2[3], q2[1], r6, r5 ; CHECK-NEXT: vmov.s8 r6, q1[15] ; 
CHECK-NEXT: vmov.s8 r5, q0[15] +; CHECK-NEXT: vmov.s8 r4, q1[14] +; CHECK-NEXT: vmov.s8 r2, q0[14] ; CHECK-NEXT: smull r6, r5, r5, r6 +; CHECK-NEXT: smull r2, r4, r2, r4 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, r6, d0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: @@ -2901,14 +2809,12 @@ ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2953,14 +2859,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -2976,29 +2880,23 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) { ; CHECK-LABEL: add_v2i64_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: umull r6, r7, r5, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r6, r12 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: mla r3, r3, r6, lr -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: mla r2, r6, r2, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: mla r3, r5, r3, r7 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: mla r3, r7, r4, r3 -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: vmov r2, r12, d3 +; CHECK-NEXT: vmov r3, lr, d1 +; CHECK-NEXT: vmov r6, r9, d2 +; CHECK-NEXT: vmov r5, r11, d0 +; CHECK-NEXT: umull r10, r8, r3, r2 +; CHECK-NEXT: umull r4, r7, r5, r6 +; CHECK-NEXT: mla r3, r3, r12, r8 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 +; CHECK-NEXT: mla r2, lr, r2, r3 +; CHECK-NEXT: mla r3, r5, r9, r7 +; CHECK-NEXT: mla r3, r11, r6, r3 +; CHECK-NEXT: vmov q0[3], 
q0[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d5 ; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r3, r7, d4 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: csetm r2, ne @@ -3006,18 +2904,16 @@ ; CHECK-NEXT: cset r3, eq ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vand q0, q3, q0 -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r7, r7, r6 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r7 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r7, r6, d0 +; CHECK-NEXT: adds r2, r2, r7 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %c = icmp eq <2 x i64> %b, zeroinitializer %m = mul <2 x i64> %x, %y Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll @@ -16,11 +16,9 @@ define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) { ; CHECK-LABEL: mul_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -33,11 +31,9 @@ ; CHECK-LABEL: mul_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -49,11 +45,9 @@ define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) { ; CHECK-LABEL: mul_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: bx lr @@ -170,14 +164,14 @@ define arm_aapcs_vfpcc i64 @mul_v2i64(<2 x i64> %x) { ; CHECK-LABEL: mul_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: umull r0, r12, r2, r1 -; CHECK-NEXT: mla r2, r2, r3, r12 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: mla r1, r3, r1, r2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: umull r0, r2, r3, r1 +; CHECK-NEXT: mla r2, r3, r12, r2 +; CHECK-NEXT: mla r1, lr, r1, r2 +; CHECK-NEXT: pop {r7, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) ret i64 %z @@ -186,26 +180,22 @@ define arm_aapcs_vfpcc i64 @mul_v4i64(<4 x i64> %x) { ; CHECK-LABEL: mul_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vmov lr, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov r6, 
s6 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: umull r3, r12, r2, lr -; CHECK-NEXT: umull r4, r8, r3, r1 -; CHECK-NEXT: umull r0, r7, r4, r6 -; CHECK-NEXT: mla r4, r4, r5, r7 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: mla r3, r3, r5, r8 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: mla r2, r2, r5, r12 -; CHECK-NEXT: mla r2, r7, lr, r2 -; CHECK-NEXT: mla r1, r2, r1, r3 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vmov r5, r9, d2 +; CHECK-NEXT: vmov r6, r11, d3 +; CHECK-NEXT: umull r2, r8, r3, r1 +; CHECK-NEXT: mla r3, r3, r12, r8 +; CHECK-NEXT: umull r7, r10, r2, r5 +; CHECK-NEXT: mla r1, lr, r1, r3 +; CHECK-NEXT: mla r2, r2, r9, r10 +; CHECK-NEXT: umull r0, r4, r7, r6 +; CHECK-NEXT: mla r1, r1, r5, r2 +; CHECK-NEXT: mla r4, r7, r11, r4 ; CHECK-NEXT: mla r1, r1, r6, r4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) ret i64 %z @@ -228,11 +218,9 @@ define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: mul_v4i32_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 @@ -247,11 +235,9 @@ ; CHECK-LABEL: mul_v8i32_acc: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 @@ -265,11 +251,9 @@ define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) { ; CHECK-LABEL: mul_v4i16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: muls r0, r1, r0 @@ -405,20 +389,18 @@ define arm_aapcs_vfpcc i64 @mul_v2i64_acc(<2 x i64> %x, i64 %y) { ; CHECK-LABEL: mul_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: mla r3, r3, r4, lr -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: mla r3, r4, r2, r3 -; CHECK-NEXT: umull r2, r4, r0, r12 -; CHECK-NEXT: mla r0, r0, r3, r4 -; CHECK-NEXT: mla r1, r1, r12, r0 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: umull r4, r5, r3, r2 +; CHECK-NEXT: mla r3, r3, r12, r5 +; CHECK-NEXT: mla r3, lr, r2, r3 +; CHECK-NEXT: umull r2, r5, r0, r4 +; CHECK-NEXT: mla r0, r0, r3, r5 +; CHECK-NEXT: mla r1, r1, r4, r0 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %x) %r = mul i64 %y, %z @@ -428,30 +410,36 @@ define arm_aapcs_vfpcc i64 @mul_v4i64_acc(<4 x i64> %x, i64 %y) { ; CHECK-LABEL: 
mul_v4i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: umull r2, lr, r3, r12 -; CHECK-NEXT: umull r5, r8, r2, r4 -; CHECK-NEXT: umull r10, r9, r5, r7 -; CHECK-NEXT: mla r5, r5, r6, r9 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: mla r2, r2, r6, r8 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: mla r3, r3, r6, lr -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: mla r3, r6, r12, r3 -; CHECK-NEXT: mla r2, r3, r4, r2 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, #12 +; CHECK-NEXT: mov lr, r0 +; CHECK-NEXT: vmov r2, r0, d1 +; CHECK-NEXT: vmov r6, r9, d2 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: vmov r7, r11, d3 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r3, r0, d0 +; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: umull r4, r8, r3, r2 +; CHECK-NEXT: mla r3, r3, r1, r8 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: umull r5, r10, r4, r6 +; CHECK-NEXT: mla r2, r1, r2, r3 +; CHECK-NEXT: mla r4, r4, r9, r10 +; CHECK-NEXT: umull r0, r12, r5, r7 +; CHECK-NEXT: mla r2, r2, r6, r4 +; CHECK-NEXT: mla r5, r5, r11, r12 ; CHECK-NEXT: mla r3, r2, r7, r5 -; CHECK-NEXT: umull r2, r7, r0, r10 -; CHECK-NEXT: mla r0, r0, r3, r7 -; CHECK-NEXT: mla r1, r1, r10, r0 +; CHECK-NEXT: umull r2, r7, lr, r0 +; CHECK-NEXT: mla r1, lr, r3, r7 +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mla r1, r3, r0, r1 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #12 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %z = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %x) %r = mul i64 %y, %z Index: llvm/test/CodeGen/Thumb2/mve-vhadd.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -260,42 +260,34 @@ ; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q2, q2, q4 ; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: adds r0, r2, #1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov r3, r2, d8 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r1, r12, d2 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: adc.w r3, r2, r12 ; CHECK-NEXT: adds r2, r1, #1 ; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r12, s7 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: adds r0, r2, #1 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov r2, r3, d7 +; 
CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov r3, r2, d9 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r1, r12, d3 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: adc.w r3, r2, r12 ; CHECK-NEXT: adds r2, r1, #1 @@ -369,40 +361,32 @@ ; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q2, q2, q4 ; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d6 ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vand q4, q0, q4 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r2, d8 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: adds r4, r1, r3 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r1, r12, d2 +; CHECK-NEXT: adds r4, r3, r1 ; CHECK-NEXT: adc.w r1, r2, r12 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, r3, d7 ; CHECK-NEXT: lsrl r4, r1, #1 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: lsrl r0, r1, #1 +; CHECK-NEXT: vmov r1, r12, d3 +; CHECK-NEXT: adds r2, r3, r1 +; CHECK-NEXT: adc.w r1, r4, r12 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r1, s19 -; CHECK-NEXT: adds r4, r4, r3 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: lsrl r4, r1, #1 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, pc} %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> @@ -698,41 +682,33 @@ ; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q1, q1, q0 ; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: vmov r4, r6, d4 ; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q4, q4, q0 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: adds.w r12, r5, r3 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: adc.w r3, r4, r6 -; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds.w r12, r4, r3 +; CHECK-NEXT: adc.w r3, r6, r5 +; CHECK-NEXT: vmov r5, r6, d10 ; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q5[2], q5[0], r6, r12 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r3, r7, d6 +; CHECK-NEXT: adds r4, r5, r3 +; CHECK-NEXT: adc.w r3, r6, r7 +; CHECK-NEXT: vmov r6, r5, d5 ; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: adds r6, r3, r5 -; CHECK-NEXT: adc.w r3, r7, r12 +; CHECK-NEXT: vmov r3, r7, d3 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r12 +; 
CHECK-NEXT: adds r6, r6, r3 +; CHECK-NEXT: adc.w r3, r5, r7 +; CHECK-NEXT: vmov r5, r7, d11 ; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q5[3], q5[1], r6, r4 -; CHECK-NEXT: vstrb.8 q5, [r2], #16 +; CHECK-NEXT: vmov r3, r12, d7 +; CHECK-NEXT: adds r4, r5, r3 +; CHECK-NEXT: adc.w r3, r7, r12 +; CHECK-NEXT: lsrl r4, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r4, r6 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -888,49 +864,41 @@ ; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q1, q1, q0 ; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q4, q4, q0 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, r4, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r12 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d10 ; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r4, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r12 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: adds.w r12, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d11 ; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r4, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r12 -; CHECK-NEXT: vstrb.8 q5, [r2], #16 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB20_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -1086,49 +1054,41 @@ ; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q1, q1, q0 ; CHECK-NEXT: vand q2, q2, q0 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov r12, s5 -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vand q4, q4, q0 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, 
r4, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vand q5, q4, q0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r4, r5, r12 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d10 ; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r4, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q5[2], q5[0], r4, r12 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: adcs r3, r6 -; CHECK-NEXT: adds.w r12, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds.w r12, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r5, r6, d11 ; CHECK-NEXT: lsrl r12, r3, #1 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r4, r5, #1 -; CHECK-NEXT: adc r3, r3, #0 -; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov q5[3], q5[1], r4, r12 -; CHECK-NEXT: vstrb.8 q5, [r2], #16 +; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r6, r3, #1 +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: lsrl r6, r3, #1 +; CHECK-NEXT: vmov q4[3], q4[1], r6, r12 +; CHECK-NEXT: vstrb.8 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9, d10, d11} Index: llvm/test/CodeGen/Thumb2/mve-vld2-post.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -66,32 +66,28 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov lr, s3 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r4, r7, d4 +; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r3, r6, d1 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r6, r6, r12 +; CHECK-NEXT: adds 
r2, r2, r4 +; CHECK-NEXT: adcs r7, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r6 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %l1 = load <4 x i64>, <4 x i64>* %src, align 4 %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> Index: llvm/test/CodeGen/Thumb2/mve-vld2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -319,32 +319,28 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov r5, r6, d0 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r3, r2, d1 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adcs r6, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r6, r2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <4 x i64>, <4 x i64>* %src, align 8 %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> @@ -357,58 +353,50 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vld2_v4i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s22 -; CHECK-NEXT: vmov.f32 s2, s20 -; CHECK-NEXT: vmov.f32 s11, s23 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s17 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: adc.w r12, 
r12, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: vmov q1[2], q1[0], r4, lr -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.f32 s19, s15 +; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov r0, r7, d8 +; CHECK-NEXT: vmov r5, r6, d4 +; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adc.w r8, r6, r7 +; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: vmov r2, r7, d0 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adc.w r6, r5, r4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r8, r6 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adc.w r0, r7, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> @@ -576,8 +564,7 @@ ; CHECK-NEXT: vins.f16 s5, s8 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vld3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -6,20 +6,21 @@ define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) { ; CHECK-LABEL: vld3_v2i32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r12, r3, [r0, #16] -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] ; CHECK-NEXT: vmov.f64 d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: add.w r3, r12, lr ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: add r2, r12 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: bx lr +; CHECK-NEXT: strd r2, r0, [r1] +; CHECK-NEXT: pop {r7, pc} entry: %l1 = load <6 x i32>, <6 x i32>* %src, align 4 %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> @@ -250,30 +251,30 @@ define void @vld3_v4i16(<12 x i16> *%src, 
<4 x i16> *%dst) { ; CHECK-LABEL: vld3_v4i16: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u16 r5, q0[6] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: vmov.u16 lr, q0[2] +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[7] +; CHECK-NEXT: vmov.u16 r6, q0[1] +; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[3] +; CHECK-NEXT: vmov.u16 r6, q0[4] +; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 +; CHECK-NEXT: vmov.u16 r12, q0[5] ; CHECK-NEXT: vadd.i32 q0, q1, q2 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrh.32 q0, [r1] -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <12 x i16>, <12 x i16>* %src, align 4 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> @@ -747,48 +748,37 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld3_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s11, s7 +; 
CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov lr, r12, d7 +; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r3, r4 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: adc.w r7, r7, r8 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, <6 x i64>* %src, align 4 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> @@ -805,86 +795,65 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov.f32 s26, s4 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s27, s5 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.f64 d14, d6 -; CHECK-NEXT: vmov r12, s27 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vmov.f32 s31, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov r7, s24 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s7, s13 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vmov.f64 d10, d7 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: vmov r3, r8, d5 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov.f32 s22, s24 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov r6, r7, d10 +; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: adds.w r0, r5, lr +; CHECK-NEXT: adc.w r5, r4, r12 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r3, 
s19 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: adc.w r8, r2, r3 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s21 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov r4, r2, d6 +; CHECK-NEXT: adc.w r12, r5, r8 +; CHECK-NEXT: vmov r5, r0, d8 +; CHECK-NEXT: adds r6, r6, r4 +; CHECK-NEXT: adcs r2, r7 +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: adc.w r8, r2, r0 +; CHECK-NEXT: vmov r7, r4, d11 +; CHECK-NEXT: vmov r2, r5, d7 +; CHECK-NEXT: vmov r3, r0, d0 ; CHECK-NEXT: adds r2, r2, r7 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r8 +; CHECK-NEXT: adc.w r7, r5, r4 +; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r8, r7 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: vmov r6, s28 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adds r3, r3, r5 ; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: adcs r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 @@ -1133,8 +1102,8 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: ldrd r2, r3, [r0, #16] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.32 q2[1], r3 @@ -1144,11 +1113,11 @@ ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmovx.f16 s8, s8 ; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s16, s6 +; CHECK-NEXT: vmovx.f16 s16, s5 ; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vmovx.f16 s18, s5 +; CHECK-NEXT: vins.f16 s4, s16 +; CHECK-NEXT: vmovx.f16 s16, s6 ; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vins.f16 s4, s18 ; CHECK-NEXT: vmovx.f16 s13, s7 ; CHECK-NEXT: vins.f16 s7, s8 ; CHECK-NEXT: vmov.f32 s0, s5 @@ -1156,10 +1125,9 @@ ; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vadd.f16 q0, q1, q0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 Index: llvm/test/CodeGen/Thumb2/mve-vld4-post.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -102,62 +102,51 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld4_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: 
.vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q0, [r0], #64 -; CHECK-NEXT: vldrw.u32 q3, [r0, #-48] -; CHECK-NEXT: vldrw.u32 q5, [r0, #-16] +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds.w lr, r5, r6 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r6, s17 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adcs r6, r5 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vldrw.u32 q2, [r0], #64 +; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r2, r7, d1 +; CHECK-NEXT: vmov r4, r8, d7 +; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r7, r7, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: adc.w r6, r6, r8 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adc.w lr, r6, r7 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r6, r4, d6 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, r7, d4 +; CHECK-NEXT: adds r5, r5, r6 +; CHECK-NEXT: adcs r4, r7 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r2, lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> Index: llvm/test/CodeGen/Thumb2/mve-vld4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -644,59 +644,51 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld4_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; 
CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov r2, s15 ; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: adcs r0, r3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r4, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r0, r8, d7 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov r4, r12, d2 ; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov r5, r7, d0 +; CHECK-NEXT: adc.w r6, r6, r8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w lr, r6, r3 +; CHECK-NEXT: vmov r3, r6, d6 +; CHECK-NEXT: adds r5, r5, r4 +; CHECK-NEXT: vmov r4, r2, d4 +; CHECK-NEXT: adc.w r7, r7, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r2, lr ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> @@ -717,112 +709,90 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: 
vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.f64 d14, d9 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vmov.f64 d4, d13 -; CHECK-NEXT: vmov.f32 s31, s3 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov.f32 s9, s27 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s27, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #96] +; CHECK-NEXT: vmov.f64 d4, d11 +; CHECK-NEXT: vmov.f32 s9, s23 +; CHECK-NEXT: vmov r3, r2, d7 +; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vmov.f32 s11, s19 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vmov r0, r6, d15 +; CHECK-NEXT: vmov.f64 d14, d11 +; CHECK-NEXT: vmov.f32 s29, s23 +; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov.f32 s30, s26 +; CHECK-NEXT: vmov.f32 s22, s24 +; CHECK-NEXT: vmov.f32 s31, s27 +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: adds r7, r4, r3 +; CHECK-NEXT: adcs r5, r2 +; CHECK-NEXT: vmov r4, r8, d14 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vmov.f32 s26, s2 ; CHECK-NEXT: vmov.f32 s18, s0 +; CHECK-NEXT: vmov.f32 s27, s3 ; CHECK-NEXT: vmov.f32 s19, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s28 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r6, r6, r12 +; CHECK-NEXT: adds.w lr, r0, r7 +; CHECK-NEXT: adc.w r12, r6, r5 +; CHECK-NEXT: vmov r6, r5, d12 ; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: adcs 
r4, r0 -; CHECK-NEXT: adds.w r9, r5, r2 -; CHECK-NEXT: vmov r5, s30 -; CHECK-NEXT: adc.w r8, r4, r3 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: vmov r4, r0, d8 +; CHECK-NEXT: adc.w r3, r3, r8 +; CHECK-NEXT: adds r6, r6, r4 +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: adds.w r9, r6, r2 +; CHECK-NEXT: adc.w r8, r0, r3 +; CHECK-NEXT: vmov r5, r4, d15 +; CHECK-NEXT: vmov r3, r6, d11 +; CHECK-NEXT: vmov r7, r0, d9 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r6, r4 +; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: adc.w r10, r4, r2 -; CHECK-NEXT: vmov r4, s21 +; CHECK-NEXT: adc.w r10, r0, r6 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: vmov r2, r0, d2 ; CHECK-NEXT: vmov q1[2], q1[0], r9, r3 -; CHECK-NEXT: vmov r5, s13 ; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 -; CHECK-NEXT: vmov r2, s24 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r6, r6, r7 -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r7, r5 -; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adc.w r0, r4, r2 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: adcs r5, r7 +; CHECK-NEXT: vmov r6, r7, d6 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r0, r7 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adcs r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: @@ -1101,31 +1071,30 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vldrh.u16 q1, [r0, #16] -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s9, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vldrh.u16 q2, [r0, #16] +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s16, s3 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vmovx.f16 s5, s8 +; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmovx.f16 s12, s1 ; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmovx.f16 s13, s4 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vmovx.f16 s13, s9 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s9, s11 ; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s17, s5 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vadd.f16 q3, q4, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q3 -; CHECK-NEXT: vadd.f16 q2, q4, q2 -; CHECK-NEXT: vadd.f16 q0, q0, q2 -; CHECK-NEXT: vmov r2, s1 -; 
CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll +++ llvm/test/CodeGen/Thumb2/mve-vmaxv-vminv-scalar.ll @@ -495,19 +495,16 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: cmp r3, r2 +; CHECK-NEXT: csel r4, r3, r2, lo ; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r4, r2, r3, lo -; CHECK-NEXT: cmp r2, r3 -; CHECK-NEXT: csel r2, r2, r3, lo -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r5, r2, r4, eq +; CHECK-NEXT: csel r2, r3, r2, lo ; CHECK-NEXT: csel r3, lr, r12, lo +; CHECK-NEXT: csel r5, r4, r2, eq +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: subs r2, r5, r0 -; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: sbcs.w r2, r3, r1 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r4, #1 @@ -526,19 +523,16 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r4, r2, r3, lt -; CHECK-NEXT: cmp r2, r3 -; CHECK-NEXT: csel r2, r2, r3, lo +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: cmp r3, r2 +; CHECK-NEXT: csel r4, r3, r2, lo ; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r5, r2, r4, eq +; CHECK-NEXT: csel r2, r3, r2, lt ; CHECK-NEXT: csel r3, lr, r12, lt +; CHECK-NEXT: csel r5, r4, r2, eq +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: subs r2, r5, r0 -; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: sbcs.w r2, r3, r1 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 @@ -557,19 +551,16 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: cmp r3, r2 +; CHECK-NEXT: csel r4, r3, r2, hi ; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r4, r2, r3, hi -; CHECK-NEXT: cmp r2, r3 -; CHECK-NEXT: csel r2, r2, r3, hi -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r5, r2, r4, eq +; CHECK-NEXT: csel r2, r3, r2, hi ; CHECK-NEXT: csel r3, lr, r12, hi +; CHECK-NEXT: csel r5, r4, r2, eq +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: subs r2, r0, r5 -; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: sbcs.w r2, r1, r3 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r4, #1 @@ -588,19 +579,16 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r4, r2, r3, gt -; CHECK-NEXT: cmp r2, r3 -; CHECK-NEXT: csel r2, r2, r3, hi +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: cmp r3, r2 +; CHECK-NEXT: csel r4, r3, r2, hi ; CHECK-NEXT: cmp lr, r12 -; CHECK-NEXT: csel r5, r2, r4, eq +; CHECK-NEXT: csel r2, r3, r2, gt ; CHECK-NEXT: csel r3, lr, r12, gt +; CHECK-NEXT: csel r5, r4, r2, eq +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: subs r2, r0, r5 -; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: sbcs.w r2, r1, r3 ; CHECK-NEXT: it lt ; CHECK-NEXT: 
movlt r4, #1 Index: llvm/test/CodeGen/Thumb2/mve-vmovn.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -876,12 +876,11 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32_badlanes(<4 x i32> %src1) { ; CHECK-LABEL: vmovn32_badlanes: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.16 q1[5], r0 ; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q1[5], r1 ; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -889,12 +888,11 @@ ; CHECKBE-LABEL: vmovn32_badlanes: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.32 q1, q0 -; CHECKBE-NEXT: vmov r0, s4 +; CHECKBE-NEXT: vmov r0, r1, d2 ; CHECKBE-NEXT: vmov.16 q2[1], r0 -; CHECKBE-NEXT: vmov r0, s5 -; CHECKBE-NEXT: vmov.16 q2[3], r0 -; CHECKBE-NEXT: vmov.16 q2[5], r0 ; CHECKBE-NEXT: vmov r0, s6 +; CHECKBE-NEXT: vmov.16 q2[3], r1 +; CHECKBE-NEXT: vmov.16 q2[5], r1 ; CHECKBE-NEXT: vmov.16 q2[7], r0 ; CHECKBE-NEXT: vrev64.16 q0, q2 ; CHECKBE-NEXT: bx lr Index: llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll +++ llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -15,18 +15,14 @@ ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmullb.s32 q2, q1, q0 ; CHECK-NEXT: vmullt.s32 q3, q1, q0 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: vmov r12, s10 +; CHECK-NEXT: vmov r12, r5, d5 ; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: vmov r4, r5, d4 ; CHECK-NEXT: lsrl r4, r5, #31 ; CHECK-NEXT: vmov q2[2], q2[0], r4, r12 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r12, r5, d7 ; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: vmov r4, r5, d6 ; CHECK-NEXT: lsrl r4, r5, #31 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r12 ; CHECK-NEXT: vstrb.8 q2, [r2], #16 Index: llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -85,22 +85,20 @@ ; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q2[4] ; CHECK-NEXT: vmullb.s16 q0, q3, q0 ; CHECK-NEXT: vmov.i32 q3, #0x7fff ; CHECK-NEXT: vshl.i32 q0, q0, #10 ; CHECK-NEXT: vshr.s32 q0, q0, #10 ; CHECK-NEXT: vshr.s32 q0, q0, #15 ; CHECK-NEXT: vmin.s32 q4, q0, q3 -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov.u16 r1, q2[4] ; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 ; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.u16 r1, q2[5] @@ -116,14 +114,12 @@ ; CHECK-NEXT: vshr.s32 q1, q1, #10 ; CHECK-NEXT: vshr.s32 q1, q1, #15 ; CHECK-NEXT: vmin.s32 q1, q1, q3 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov 
r0, s6 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vqmovn.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -164,22 +164,20 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_smaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_smaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: subs.w r2, r2, r12 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: subs.w r1, r1, r12 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: subs.w r3, r3, r12 +; CHECK-NEXT: subs.w r2, r2, r12 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: sbcs r2, r2, #0 +; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #1 @@ -192,19 +190,17 @@ ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs.w r1, r12, r1 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r12, r2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r2, r12, r3 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -240,21 +236,19 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_sminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_sminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r1, r2, d1 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs.w r1, r12, r1 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r12, r2 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r2, r12, r3 ; CHECK-NEXT: mvn r12, #-2147483648 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lt @@ -268,19 +262,17 @@ ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: subs.w r2, r2, r12 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: subs.w r1, r1, r12 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; 
CHECK-NEXT: subs.w r3, r3, r12 -; CHECK-NEXT: sbcs r2, r2, #0 +; CHECK-NEXT: subs.w r2, r2, r12 +; CHECK-NEXT: sbcs r2, r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 @@ -316,21 +308,19 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_umaxmin(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_umaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r3, r3, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: subs.w r1, r1, #-1 +; CHECK-NEXT: sbcs r1, r3, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 @@ -350,21 +340,19 @@ define arm_aapcs_vfpcc <2 x i64> @vqmovni64_uminmax(<2 x i64> %s0) { ; CHECK-LABEL: vqmovni64_uminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: subs.w r1, r1, #-1 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs.w r3, r3, #-1 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: subs.w r1, r1, #-1 +; CHECK-NEXT: sbcs r1, r3, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r2, #1 ; CHECK-NEXT: cmp r2, #0 Index: llvm/test/CodeGen/Thumb2/mve-vqshrn.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -180,56 +180,52 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_smaxmin(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_smaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: mvn r12, #-2147483648 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: mov.w lr, #0 -; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: subs.w r3, r2, r12 -; CHECK-NEXT: sbcs r3, r1, #0 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: mvn lr, #-2147483648 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: asrl r2, r3, #3 +; CHECK-NEXT: asrl r0, r1, #3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: subs.w r2, r2, lr +; CHECK-NEXT: sbcs r2, r3, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: asrl r4, r3, #3 -; CHECK-NEXT: subs.w r0, r4, r12 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: subs.w r0, r0, lr +; CHECK-NEXT: sbcs r0, r1, #0 ; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: it lt ; CHECK-NEXT: 
movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q0[2], q0[0], r0, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r5 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 ; CHECK-NEXT: adr r0, .LCPI12_0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: mov.w r2, #-1 +; CHECK-NEXT: vbic q1, q2, q1 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r1 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r2, r3 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt.w lr, #1 -; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: movlt.w r12, #1 +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 @@ -238,7 +234,7 @@ ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vbic q2, q2, q1 ; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: @@ -265,19 +261,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r2, r1, d1 ; CHECK-NEXT: mov.w r12, #-1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: mov.w lr, #0 ; CHECK-NEXT: asrl r2, r1, #3 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: mov.w lr, #0 ; CHECK-NEXT: rsbs.w r3, r2, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r12, r1 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r4, r3, d0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: asrl r4, r3, #3 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 @@ -297,19 +291,17 @@ ; CHECK-NEXT: vbic q1, q1, q0 ; CHECK-NEXT: vand q0, q2, q0 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: subs r1, r1, r2 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs r0, r0, #0 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov r1, r3, d0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: sbcs r1, r1, #0 +; CHECK-NEXT: subs r1, r1, r2 +; CHECK-NEXT: sbcs r1, r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w lr, #1 ; CHECK-NEXT: cmp.w lr, #0 @@ -346,37 +338,33 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_umaxmin(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_umaxmin: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: lsrl r0, r5, #3 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: subs.w r3, r0, #-1 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: vmov r0, r3, d1 +; 
CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov r2, r1, d0 +; CHECK-NEXT: lsrl r0, r3, #3 +; CHECK-NEXT: lsrl r2, r1, #3 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: csetm r12, ne -; CHECK-NEXT: lsrl r4, r3, #3 -; CHECK-NEXT: subs.w r1, r4, #-1 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: subs.w r2, r2, #-1 +; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r5 +; CHECK-NEXT: movlo.w r12, #1 +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vbic q1, q2, q1 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bx lr entry: %s0 = lshr <2 x i64> %so, %c1 = icmp ult <2 x i64> %s0, @@ -387,37 +375,33 @@ define arm_aapcs_vfpcc <2 x i64> @vqshrni64_uminmax(<2 x i64> %so) { ; CHECK-LABEL: vqshrni64_uminmax: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: lsrl r0, r5, #3 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: subs.w r3, r0, #-1 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov r2, r1, d0 +; CHECK-NEXT: lsrl r0, r3, #3 +; CHECK-NEXT: lsrl r2, r1, #3 +; CHECK-NEXT: vmov.i64 q2, #0xffffffff +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: csetm r12, ne -; CHECK-NEXT: lsrl r4, r3, #3 -; CHECK-NEXT: subs.w r1, r4, #-1 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r0 -; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: movlo r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: subs.w r2, r2, #-1 +; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r5 +; CHECK-NEXT: movlo.w r12, #1 +; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: vmov q0[2], q0[0], r1, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r12 -; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vbic q1, q2, q1 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: bx lr entry: %s0 = lshr <2 x i64> %so, %c2 = icmp ult <2 x i64> %s0, Index: llvm/test/CodeGen/Thumb2/mve-vst2.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -520,9 +520,8 @@ ; CHECK-NEXT: vins.f16 s4, 
s0 ; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: str r0, [r1, #4] -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, r2, d2 +; CHECK-NEXT: str r2, [r1, #4] ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -301,35 +301,31 @@ define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) { ; CHECK-LABEL: vst3_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrh.u32 q1, [r0] -; CHECK-NEXT: vldrh.u32 q3, [r0, #8] +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrh.u32 q2, [r0, #16] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.f32 s2, s15 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vstrh.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrh.u32 q1, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0, #8] +; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov r0, r5, d2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov lr, r4, d1 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov.16 q0[3], r5 +; CHECK-NEXT: vstrh.32 q3, [r1, #16] +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: vmov.16 q0[5], r4 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], lr +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0 %l1 = load <4 x i16>, <4 x i16>* %s1, align 4 @@ -621,40 +617,33 @@ define void @vst3_v4i8(<4 x i8> *%src, <12 x i8> *%dst) { ; CHECK-LABEL: vst3_v4i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] ; CHECK-NEXT: vldrb.u32 q1, [r0] -; CHECK-NEXT: vldrb.u32 q2, [r0, #4] -; CHECK-NEXT: vldrb.u32 q3, [r0, #8] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.8 q4[8], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.8 q4[10], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.8 q4[11], r0 -; 
CHECK-NEXT: vstrb.16 q0, [r1] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: str r0, [r1, #8] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r2, lr, d0 +; CHECK-NEXT: vmov r12, r3, d1 +; CHECK-NEXT: vldrb.u32 q0, [r0, #8] +; CHECK-NEXT: vmov r0, r6, d3 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov.8 q2[8], r4 +; CHECK-NEXT: vmov.8 q2[9], r6 +; CHECK-NEXT: vmov.8 q2[10], r3 +; CHECK-NEXT: vmov.8 q2[11], r5 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: str r3, [r1, #8] +; CHECK-NEXT: vmov r3, r4, d2 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.16 q1[3], r4 +; CHECK-NEXT: vmov.16 q1[4], lr +; CHECK-NEXT: vmov.16 q1[5], r5 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.16 q1[7], r12 +; CHECK-NEXT: vstrb.16 q1, [r1] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0 %l1 = load <4 x i8>, <4 x i8>* %s1, align 4 @@ -1313,11 +1302,9 @@ ; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vins.f16 s2, s10 ; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: str r0, [r1, #8] -; CHECK-NEXT: strd r3, r2, [r1] +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, r2, d0 +; CHECK-NEXT: stm r1!, {r0, r2, r3} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1365,11 +1352,10 @@ ; CHECK-NEXT: vins.f16 s5, s10 ; CHECK-NEXT: vins.f16 s17, s12 ; CHECK-NEXT: vmov.f32 s16, s5 -; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s8 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r0, r2, d8 ; CHECK-NEXT: strd r0, r2, [r1, #16] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/mve-vst4.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -203,40 +203,40 @@ define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vst4_v4i32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vmov.f32 s20, s11 -; CHECK-NEXT: vmov.f32 s8, s10 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vdup.32 q4, r3 +; CHECK-NEXT: vmov.f64 d0, d6 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f64 d4, d7 +; CHECK-NEXT: vmov.f32 s12, s15 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vstrb.8 q3, [r1, #48] ; CHECK-NEXT: vmov.f32 s9, s6 -; 
CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vstrb.8 q4, [r1] -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrb.8 q5, [r1, #48] +; CHECK-NEXT: vdup.32 q3, r2 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s11, s15 ; CHECK-NEXT: vstrb.8 q2, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vdup.32 q2, lr +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrb.8 q1, [r1, #16] +; CHECK-NEXT: vdup.32 q1, r12 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0 %l1 = load <4 x i32>, <4 x i32>* %s1, align 4 @@ -297,43 +297,36 @@ define void @vst4_v4i16(<4 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vst4_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrh.u32 q1, [r0] -; CHECK-NEXT: vldrh.u32 q2, [r0, #8] -; CHECK-NEXT: vldrh.u32 q3, [r0, #16] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r0, #8] +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] +; CHECK-NEXT: vmov lr, r12, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vldrh.u32 q0, [r0] +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov.16 q1[0], r4 +; CHECK-NEXT: vmov r0, r4, d5 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, r2, d4 +; CHECK-NEXT: vmov.16 q1[4], r5 +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov r3, r5, d0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q1[6], r4 +; CHECK-NEXT: vmov.16 q0[1], lr +; CHECK-NEXT: vmov.16 q1[7], r4 ; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vstrh.16 q1, [r1, #16] ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vstrh.16 q0, [r1, #16] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vstrh.16 q4, [r1] -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.16 q0[4], r5 +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0 %l1 = load <4 x i16>, <4 x i16>* %s1, align 4 @@ -531,39 +524,35 @@ define void @vst4_v4i8(<4 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vst4_v4i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrb.u32 q1, [r0] -; CHECK-NEXT: vldrb.u32 q2, [r0, #4] -; CHECK-NEXT: vldrb.u32 q3, [r0, #8] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.8 
q0[0], r2 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vldrb.u32 q2, [r0, #8] +; CHECK-NEXT: vmov r4, r5, d2 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov.8 q0[0], r4 +; CHECK-NEXT: vmov r0, r4, d4 ; CHECK-NEXT: vmov.8 q0[1], r2 -; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.8 q0[2], r0 ; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r0, r2, d5 +; CHECK-NEXT: vmov.8 q0[4], r5 +; CHECK-NEXT: vmov.8 q0[5], r3 +; CHECK-NEXT: vmov r3, r5, d3 +; CHECK-NEXT: vmov.8 q0[6], r4 +; CHECK-NEXT: vmov.8 q0[7], r4 +; CHECK-NEXT: vmov.8 q0[8], r3 +; CHECK-NEXT: vmov.8 q0[9], lr ; CHECK-NEXT: vmov.8 q0[10], r0 ; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vmov.8 q0[12], r5 +; CHECK-NEXT: vmov.8 q0[13], r12 +; CHECK-NEXT: vmov.8 q0[14], r2 +; CHECK-NEXT: vmov.8 q0[15], r2 ; CHECK-NEXT: vstrb.8 q0, [r1] -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0 %l1 = load <4 x i8>, <4 x i8>* %s1, align 4 @@ -984,33 +973,40 @@ define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vst4_v4f32_align1: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.f32 s20, s15 -; CHECK-NEXT: vmov.f32 s5, s9 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s22, s3 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vstrb.8 q1, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s8 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s19, s0 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vstrb.8 q4, [r1] -; CHECK-NEXT: vmov.f32 s0, s14 -; CHECK-NEXT: vstrb.8 q5, [r1, #48] -; CHECK-NEXT: vmov.f32 s1, s10 -; CHECK-NEXT: vmov.f32 s3, s2 -; CHECK-NEXT: vstrb.8 q0, [r1, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f64 d2, d8 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vdup.32 q5, r3 +; CHECK-NEXT: vmov.f32 s8, s17 +; CHECK-NEXT: vmov.f64 d6, d9 +; CHECK-NEXT: vmov.f32 s16, s19 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vstrb.8 q4, [r1, #48] +; CHECK-NEXT: vmov.f32 s13, s10 +; CHECK-NEXT: vdup.32 q4, r2 +; CHECK-NEXT: vmov r12, lr, d0 +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vmov.f32 s15, s2 +; CHECK-NEXT: vstrb.8 q3, [r1, #32] +; CHECK-NEXT: vdup.32 q3, lr +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vdup.32 q2, r12 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vstrb.8 q1, [r1] ; 
CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0 %l1 = load <4 x float>, <4 x float>* %s1, align 4 Index: llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -50,10 +50,8 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) { ; CHECK-LABEL: foo_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpt.s32 lt, q0, zr @@ -62,36 +60,31 @@ ; CHECK-NEXT: vmov.i64 q5, #0xffffffff ; CHECK-NEXT: vmov.f32 s2, s17 ; CHECK-NEXT: vand q6, q0, q5 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s25 -; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: vmov r0, r1, d13 ; CHECK-NEXT: bl __aeabi_ul2d +; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov.f64 d0, d9 ; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov d8, r4, r5 +; CHECK-NEXT: vand q5, q0, q5 +; CHECK-NEXT: vmov r4, r5, d11 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_ul2d +; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_ul2d +; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov d11, r0, r1 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d10, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vmov q1, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)