Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -13794,22 +13794,51 @@ } // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d + // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(InDouble.getOperand(1))) { SDValue BV = InDouble.getOperand(0); - // Look up through any nop bitcasts - while (BV.getOpcode() == ISD::BITCAST && - (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) + // Look up through any nop bitcasts and vector_reg_casts. bitcasts may + // change lane order under big endian. + bool BVSwap = BV.getOpcode() == ISD::BITCAST; + while ( + (BV.getOpcode() == ISD::BITCAST || + BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && + (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { + BVSwap = BV.getOpcode() == ISD::BITCAST; BV = BV.getOperand(0); - if (BV.getValueType() != MVT::v4i32 || BV.getOpcode() != ISD::BUILD_VECTOR) + } + if (BV.getValueType() != MVT::v4i32) return SDValue(); + + // Handle buildvectors, pulling out the correct lane depending on + // endianness. unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0; - if (Subtarget->isLittle()) - return DCI.DAG.getMergeValues( - {BV.getOperand(Offset), BV.getOperand(Offset + 1)}, SDLoc(N)); - else - return DCI.DAG.getMergeValues( - {BV.getOperand(Offset + 1), BV.getOperand(Offset)}, SDLoc(N)); + if (BV.getOpcode() == ISD::BUILD_VECTOR) { + SDValue Op0 = BV.getOperand(Offset); + SDValue Op1 = BV.getOperand(Offset + 1); + if (!Subtarget->isLittle() && BVSwap) + std::swap(Op0, Op1); + + return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); + } + + // A chain of insert_vectors, grabbing the correct value of the chain of + // inserts. + SDValue Op0, Op1; + while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (isa(BV.getOperand(2))) { + if (BV.getConstantOperandVal(2) == Offset) + Op0 = BV.getOperand(1); + if (BV.getConstantOperandVal(2) == Offset + 1) + Op1 = BV.getOperand(1); + } + BV = BV.getOperand(0); + } + if (!Subtarget->isLittle() && BVSwap) + std::swap(Op0, Op1); + if (Op0 && Op1) + return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); } return SDValue(); Index: llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll =================================================================== --- llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll +++ llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll @@ -44,9 +44,14 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: vmovl.u8 q8, d16 -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.u16 r1, d16[1] +; CHECK-NEXT: vmov.u16 r2, d16[2] +; CHECK-NEXT: vmov.u16 r3, d16[3] +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: uxtb r1, r1 +; CHECK-NEXT: uxtb r2, r2 +; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: bx lr %1 = load <4 x i8>, <4 x i8>* %in, align 4 %2 = extractelement <4 x i8> %1, i32 0 Index: llvm/test/CodeGen/ARM/vdup.ll =================================================================== --- llvm/test/CodeGen/ARM/vdup.ll +++ llvm/test/CodeGen/ARM/vdup.ll @@ -100,9 +100,9 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind { ; CHECK-LABEL: v_dupQ32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov pc, lr %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 @@ -379,10 +379,9 @@ define <4 x i32> @tdupi(i32 %x, i32 %y) { ; CHECK-LABEL: tdupi: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov.32 d17[1], r1 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: mov pc, lr %1 = insertelement <4 x i32> undef, i32 %x, i32 0 %2 = insertelement <4 x i32> %1, i32 %x, i32 1 @@ -412,11 +411,10 @@ ; CHECK-LABEL: tduplane: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 -; CHECK-NEXT: mov r0, #255 -; CHECK-NEXT: vdup.32 q8, d16[1] -; CHECK-NEXT: vmov.32 d17[1], r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r3, #255 +; CHECK-NEXT: vmov.32 r0, d16[1] +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: mov pc, lr %in = extractelement <4 x i32> %invec, i32 1 %1 = insertelement <4 x i32> undef, i32 %in, i32 0 Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -52,16 +52,11 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r3, r1, asr #31 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> @@ -174,51 +169,28 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.s16 r0, q0[1] -; CHECK-NEXT: vmov.s16 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov.s16 r0, q0[0] +; CHECK-NEXT: vmov.s16 r2, q0[1] +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s16 r2, q0[2] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s16 r2, q0[3] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s16 r2, q0[4] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s16 r2, q0[5] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s16 r3, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s16 r2, q0[6] +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s16 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s16 r3, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r0, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 -; CHECK-NEXT: vmov r0, r3, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: @@ -269,18 +241,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: sxth r1, r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r3, r1, asr #31 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -525,99 +492,52 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.s8 r0, q0[1] -; CHECK-NEXT: vmov.s8 r1, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: vmov.s8 r0, q0[0] +; CHECK-NEXT: vmov.s8 r2, q0[1] +; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[2] +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[3] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[2] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[4] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[5] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[6] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[8] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[9] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[10] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[11] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[10] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[12] ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[13] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.s8 r2, q0[14] +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.s8 r2, q0[15] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.s8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r0, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 -; CHECK-NEXT: vmov r0, r3, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: @@ -675,59 +595,36 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) { ; CHECK-LABEL: add_v8i8_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r2, r3, d2 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q0[4] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r0, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r0 -; CHECK-NEXT: vmov r0, r3, d0 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: @@ -780,18 +677,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: sxtb r1, r0 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r3, r1, asr #31 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -871,21 +763,14 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) { ; CHECK-LABEL: add_v2i32_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, lr, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = sext <2 x i32> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -1008,57 +893,34 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.s16 r2, q0[1] -; CHECK-NEXT: vmov.s16 r3, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.s16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.s16 r2, q0[3] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s16 r2, q0[5] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov.s16 r2, q0[0] +; CHECK-NEXT: vmov.s16 r3, q0[1] +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: vmov.s16 r2, q0[2] +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adc.w r2, r3, r2, asr #31 +; CHECK-NEXT: vmov.s16 r3, q0[3] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s16 r3, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s16 r2, q0[7] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s16 r3, q0[5] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s16 r3, q0[6] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r4, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w lr, r2, r3, asr #31 +; CHECK-NEXT: vmov.s16 r3, q0[7] +; CHECK-NEXT: adds.w r2, r12, r3 +; CHECK-NEXT: adc.w r3, lr, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <8 x i16> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1113,23 +975,16 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, lr, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -1390,105 +1245,58 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.s8 r2, q0[1] -; CHECK-NEXT: vmov.s8 r3, q0[0] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.s8 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.s8 r2, q0[3] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[5] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov.s8 r2, q0[0] +; CHECK-NEXT: vmov.s8 r3, q0[1] +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: vmov.s8 r2, q0[2] +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adc.w r2, r3, r2, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[3] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[7] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[5] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[6] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[9] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[7] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[11] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[9] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[10] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[13] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[11] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.s8 r2, q0[15] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[13] +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.s8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r4, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w lr, r2, r3, asr #31 +; CHECK-NEXT: vmov.s8 r3, q0[15] +; CHECK-NEXT: adds.w r2, r12, r3 +; CHECK-NEXT: adc.w r3, lr, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) @@ -1550,65 +1358,42 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v8i8_v8i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov.u16 r3, q0[1] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r3, r2, d2 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: adds.w lr, r2, r3 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 ; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adc.w r2, r3, r2, asr #31 +; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w r2, r2, r3, asr #31 ; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r4, r2, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adc.w r4, r4, r12 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r4, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: adc.w lr, r2, r3, asr #31 +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: adds.w r2, r12, r3 +; CHECK-NEXT: adc.w r3, lr, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <8 x i8> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1665,23 +1450,16 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: asr.w r12, r2, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adc.w r2, lr, r2, asr #31 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r12, r3, asr #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) Index: llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -248,13 +248,7 @@ ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: umlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -273,16 +267,10 @@ ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -557,181 +545,137 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r1, q1[0] ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 -; CHECK-NEXT: vmov.u8 r3, q1[0] -; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umull r0, r12, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: smlabb r0, r3, r2, r0 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.u8 r3, q1[2] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov q4[2], q4[0], r1, r3 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov q5[2], q5[0], r1, r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov lr, r12, d6 ; CHECK-NEXT: umull r1, r2, r1, r2 -; CHECK-NEXT: smlabb r0, r0, r3, r1 -; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r1, r2, r12 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: umlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: adc.w lr, r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[4] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umlal r0, r1, r2, r12 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[6] -; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umlal r0, r1, r2, r12 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[8] -; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umlal r0, r1, r2, r12 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[10] -; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umlal r0, r1, r2, r12 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[12] -; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: umlal r0, r1, r2, r12 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov r2, r3, d7 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q1[14] -; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 +; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov r12, s4 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: umull r1, r3, r1, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: umlal r0, r1, r2, r12 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -743,119 +687,55 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.s8 r0, q1[1] ; CHECK-NEXT: vmov.s8 r1, q0[1] +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov.s8 r2, q1[0] ; CHECK-NEXT: vmov.s8 r3, q0[0] -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[2] +; CHECK-NEXT: vmov.s8 r3, q0[2] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[3] -; CHECK-NEXT: adc.w lr, r3, r1 ; CHECK-NEXT: vmov.s8 r3, q0[3] -; CHECK-NEXT: vmov.s8 r0, q1[2] -; CHECK-NEXT: vmov.s8 r1, q0[2] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[4] +; CHECK-NEXT: vmov.s8 r3, q0[4] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[5] ; CHECK-NEXT: vmov.s8 r3, q0[5] -; CHECK-NEXT: vmov.s8 r0, q1[4] -; CHECK-NEXT: vmov.s8 r1, q0[4] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[6] +; CHECK-NEXT: vmov.s8 r3, q0[6] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[7] ; CHECK-NEXT: vmov.s8 r3, q0[7] -; CHECK-NEXT: vmov.s8 r0, q1[6] -; CHECK-NEXT: vmov.s8 r1, q0[6] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[8] +; CHECK-NEXT: vmov.s8 r3, q0[8] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[9] ; CHECK-NEXT: vmov.s8 r3, q0[9] -; CHECK-NEXT: vmov.s8 r0, q1[8] -; CHECK-NEXT: vmov.s8 r1, q0[8] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[10] +; CHECK-NEXT: vmov.s8 r3, q0[10] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[11] ; CHECK-NEXT: vmov.s8 r3, q0[11] -; CHECK-NEXT: vmov.s8 r0, q1[10] -; CHECK-NEXT: vmov.s8 r1, q0[10] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[12] +; CHECK-NEXT: vmov.s8 r3, q0[12] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[13] ; CHECK-NEXT: vmov.s8 r3, q0[13] -; CHECK-NEXT: vmov.s8 r0, q1[12] -; CHECK-NEXT: vmov.s8 r1, q0[12] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d4 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.s8 r2, q1[14] +; CHECK-NEXT: vmov.s8 r3, q0[14] +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.s8 r2, q1[15] ; CHECK-NEXT: vmov.s8 r3, q0[15] -; CHECK-NEXT: vmov.s8 r0, q1[14] -; CHECK-NEXT: vmov.s8 r1, q0[14] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -959,16 +839,10 @@ ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -981,25 +855,21 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: add_v2i64_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: vmov r2, lr, d1 -; CHECK-NEXT: vmov r4, r9, d2 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: umull r1, r8, r2, r0 -; CHECK-NEXT: umull r3, r5, r6, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r1 -; CHECK-NEXT: mla r1, r2, r12, r8 -; CHECK-NEXT: mla r1, lr, r0, r1 -; CHECK-NEXT: mla r0, r6, r9, r5 -; CHECK-NEXT: mla r0, r7, r4, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov r0, lr, d3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: umull r12, r1, r2, r0 +; CHECK-NEXT: mla r1, r2, lr, r1 +; CHECK-NEXT: mla lr, r3, r0, r1 +; CHECK-NEXT: vmov r0, r2, d2 +; CHECK-NEXT: vmov r3, r1, d0 +; CHECK-NEXT: umull r4, r5, r3, r0 +; CHECK-NEXT: mla r2, r3, r2, r5 +; CHECK-NEXT: mla r1, r1, r0, r2 +; CHECK-NEXT: adds.w r0, r4, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %m = mul <2 x i64> %x, %y %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m) @@ -1245,18 +1115,12 @@ ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov lr, s4 -; CHECK-NEXT: umull r12, r3, r3, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: umull r2, lr, r2, lr -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: umull r2, lr, r3, r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: umlal r2, lr, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -1276,20 +1140,14 @@ ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: smull r12, r3, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: sxth.w lr, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: smull r2, lr, r2, lr -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: smull r2, r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: sxth.w lr, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r1, r1, r12 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i16> %x to <2 x i64> @@ -1487,183 +1345,141 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.u8 r3, q1[2] +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov.u8 r3, q1[0] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r12, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.u8 r4, q0[0] -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: umull lr, r12, r2, r12 +; CHECK-NEXT: vmov r12, s12 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r2, r3, r2, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r2, lr -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov q5[2], q5[0], r4, r3 +; CHECK-NEXT: vmov.u8 r4, q0[2] +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: umull lr, r12, r2, r12 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: smlabb lr, r2, r3, lr +; CHECK-NEXT: vmov.u8 r3, q1[3] +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vand q5, q5, q2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: vmov lr, r12, d6 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: smlabb r2, r5, r4, r2 -; CHECK-NEXT: vmov r5, r4, d7 ; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: vmov.u8 lr, q0[5] ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: adc.w lr, r3, r4 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov.u8 r12, q1[5] ; CHECK-NEXT: vmov.u8 r4, q1[4] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: vmov q4[2], q4[0], r4, lr ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 -; CHECK-NEXT: vmov r5, s14 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r5, q1[7] +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.u8 lr, q0[7] +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r4, r12, r4, r12 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov.u8 r4, q1[6] -; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov.u8 r12, q1[7] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov q4[2], q4[0], r4, lr ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r5, q1[9] +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.u8 lr, q0[9] +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r4, r12, r4, r12 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov.u8 r4, q1[8] -; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov.u8 r12, q1[9] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: vmov q4[2], q4[0], r4, lr ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r5, q1[11] +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.u8 lr, q0[11] +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r4, r12, r4, r12 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov.u8 r4, q1[10] -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov.u8 r12, q1[11] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[10] +; CHECK-NEXT: vmov q4[2], q4[0], r4, lr ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r5, q1[13] +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.u8 lr, q0[13] +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r4, r12, r4, r12 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov.u8 r4, q1[12] -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov.u8 r12, q1[13] +; CHECK-NEXT: vmov q3[2], q3[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vmov q4[2], q4[0], r4, lr ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d6 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.u8 r5, q1[15] +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.u8 lr, q0[15] +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: umull r4, r12, r4, r12 +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov.u8 r4, q1[14] -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov q1[2], q1[0], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: vmov.u8 r12, q1[15] +; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 +; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r4, lr ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r2, r5, r2, r5 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: umlal r2, r3, r4, r12 +; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: umull r4, r12, r4, r12 ; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1676,121 +1492,59 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.s8 r2, q1[1] ; CHECK-NEXT: vmov.s8 r3, q0[1] -; CHECK-NEXT: smull lr, r12, r3, r2 -; CHECK-NEXT: vmov.s8 r3, q1[0] +; CHECK-NEXT: smull r3, lr, r3, r2 +; CHECK-NEXT: vmov.s8 r12, q1[0] ; CHECK-NEXT: vmov.s8 r2, q0[0] -; CHECK-NEXT: vmov.s8 r4, q1[2] -; CHECK-NEXT: vmov.s8 r5, q0[2] -; CHECK-NEXT: smull r2, r3, r2, r3 -; CHECK-NEXT: vmov q2[2], q2[0], r2, lr -; CHECK-NEXT: smull r4, r5, r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r12 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov r3, r2, d4 -; CHECK-NEXT: adds.w lr, lr, r3 -; CHECK-NEXT: vmov.s8 r3, q0[3] -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov.s8 r2, q1[3] -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[5] -; CHECK-NEXT: vmov.s8 r4, q0[5] -; CHECK-NEXT: vmov.s8 r2, q1[4] -; CHECK-NEXT: vmov.s8 r3, q0[4] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[7] -; CHECK-NEXT: vmov.s8 r4, q0[7] -; CHECK-NEXT: vmov.s8 r2, q1[6] -; CHECK-NEXT: vmov.s8 r3, q0[6] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[9] -; CHECK-NEXT: vmov.s8 r4, q0[9] -; CHECK-NEXT: vmov.s8 r2, q1[8] -; CHECK-NEXT: vmov.s8 r3, q0[8] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[11] -; CHECK-NEXT: vmov.s8 r4, q0[11] -; CHECK-NEXT: vmov.s8 r2, q1[10] -; CHECK-NEXT: vmov.s8 r3, q0[10] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[13] -; CHECK-NEXT: vmov.s8 r4, q0[13] -; CHECK-NEXT: vmov.s8 r2, q1[12] -; CHECK-NEXT: vmov.s8 r3, q0[12] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: adc.w lr, r3, r4 -; CHECK-NEXT: vmov.s8 r5, q1[15] -; CHECK-NEXT: vmov.s8 r4, q0[15] -; CHECK-NEXT: vmov.s8 r2, q1[14] -; CHECK-NEXT: vmov.s8 r3, q0[14] -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r4 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[2] +; CHECK-NEXT: vmov.s8 r2, q0[2] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[3] +; CHECK-NEXT: vmov.s8 r2, q0[3] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[4] +; CHECK-NEXT: vmov.s8 r2, q0[4] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[5] +; CHECK-NEXT: vmov.s8 r2, q0[5] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[6] +; CHECK-NEXT: vmov.s8 r2, q0[6] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[7] +; CHECK-NEXT: vmov.s8 r2, q0[7] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[8] +; CHECK-NEXT: vmov.s8 r2, q0[8] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[9] +; CHECK-NEXT: vmov.s8 r2, q0[9] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[10] +; CHECK-NEXT: vmov.s8 r2, q0[10] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[11] +; CHECK-NEXT: vmov.s8 r2, q0[11] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[12] +; CHECK-NEXT: vmov.s8 r2, q0[12] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[13] +; CHECK-NEXT: vmov.s8 r2, q0[13] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[14] +; CHECK-NEXT: vmov.s8 r2, q0[14] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: vmov.s8 r12, q1[15] +; CHECK-NEXT: vmov.s8 r2, q0[15] +; CHECK-NEXT: smlal r3, lr, r2, r12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -1837,20 +1591,14 @@ ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r12, r3, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: sxtb.w lr, r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, lr, r2, lr -; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r2, lr, d0 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: smull r2, r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: sxtb.w lr, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc.w r1, r1, r12 ; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <2 x i8> %x to <2 x i64> @@ -1864,27 +1612,23 @@ define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) { ; CHECK-LABEL: add_v2i64_v2i64_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vmov r2, r12, d3 ; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vmov r6, r9, d2 -; CHECK-NEXT: vmov r5, r11, d0 -; CHECK-NEXT: umull r10, r8, r3, r2 -; CHECK-NEXT: umull r4, r7, r5, r6 -; CHECK-NEXT: mla r3, r3, r12, r8 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r10 -; CHECK-NEXT: mla r2, lr, r2, r3 -; CHECK-NEXT: mla r3, r5, r9, r7 -; CHECK-NEXT: mla r3, r11, r6, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: adcs r2, r6 +; CHECK-NEXT: vmov r4, r6, d0 +; CHECK-NEXT: umull r8, r5, r3, r2 +; CHECK-NEXT: mla r3, r3, r12, r5 +; CHECK-NEXT: mla r12, lr, r2, r3 +; CHECK-NEXT: vmov r3, r5, d2 +; CHECK-NEXT: umull r7, r2, r4, r3 +; CHECK-NEXT: mla r2, r4, r5, r2 +; CHECK-NEXT: mla r2, r6, r3, r2 +; CHECK-NEXT: adds.w r3, r7, r8 +; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %m = mul <2 x i64> %x, %y %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)