diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13066,11 +13066,83 @@
 
 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
-  if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+  if (!Subtarget->hasMVEIntegerOps())
     return SDValue();
 
+  EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  // Returns true if Op is one of the add-reduction nodes handled here: the
+  // generic VECREDUCE_ADD or the ARM VADDV/VMLAV nodes.
+  auto IsVecReduce = [](SDValue Op) {
+    switch (Op.getOpcode()) {
+    case ISD::VECREDUCE_ADD:
+    case ARMISD::VADDVs:
+    case ARMISD::VADDVu:
+    case ARMISD::VMLAVs:
+    case ARMISD::VMLAVu:
+      return true;
+    }
+    return false;
+  };
+
+  // Reassociate an i32 add tree so that each reduction is added onto a
+  // single running sum. Returns the rewritten add, or an empty SDValue when
+  // neither pattern below matches.
+  auto DistributeAddAddVecReduce = [&](SDValue N0, SDValue N1) {
+    // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
+    //   add(add(X, vecreduce(Y)), vecreduce(Z))
+    // to make better use of vaddva style instructions.
+    // Only do so when the inner add has no other user; otherwise it stays
+    // live and the rewrite just creates extra adds.
+    // NOTE(review): constant X is skipped on the assumption that the generic
+    // combiner pulls constants back outwards, which would undo this rewrite
+    // and re-trigger it indefinitely - confirm against DAGCombiner.
+    if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
+        IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
+        !isa<ConstantSDNode>(N0) && N1.hasOneUse()) {
+      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
+      return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
+    }
+    // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
+    //   add(add(add(A, C), reduce(B)), reduce(D))
+    // Again both inner adds must be single-use for this to be a win.
+    if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
+        N1.getOpcode() == ISD::ADD && N0.hasOneUse() && N1.hasOneUse()) {
+      // Find which operand of each inner add is the reduction.
+      unsigned N0RedOp = 0;
+      if (!IsVecReduce(N0.getOperand(N0RedOp))) {
+        N0RedOp = 1;
+        if (!IsVecReduce(N0.getOperand(N0RedOp)))
+          return SDValue();
+      }
+
+      unsigned N1RedOp = 0;
+      if (!IsVecReduce(N1.getOperand(N1RedOp)))
+        N1RedOp = 1;
+      if (!IsVecReduce(N1.getOperand(N1RedOp)))
+        return SDValue();
+
+      // Sum the two non-reduction operands first, then chain the reductions.
+      SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
+                                 N1.getOperand(1 - N1RedOp));
+      SDValue Add1 =
+          DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
+      return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
+    }
+    return SDValue();
+  };
+  // ADD is commutative, so try the patterns with both operand orders.
+  if (SDValue R = DistributeAddAddVecReduce(N0, N1))
+    return R;
+  if (SDValue R = DistributeAddAddVecReduce(N1, N0))
+    return R;
+
+  // The remaining combine only applies to i64 adds.
+  if (VT != MVT::i64)
+    return SDValue();
 
   // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
   // will look like:
@@ -13090,7 +13162,6 @@
         NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
       return SDValue();
 
-    SDLoc dl(N);
     if (VecRed->getOpcode() == OpcodeA) {
       // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
       SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -117,9 +117,8 @@
 define arm_aapcs_vfpcc i32 @vaddva_v8i32_i32(<8 x i32> %s1, i32 %x) {
 ; CHECK-LABEL: vaddva_v8i32_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u32 r2, q1
-; CHECK-NEXT: vaddva.u32 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u32 r0, q0
+; CHECK-NEXT: vaddva.u32 r0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -141,9 +140,8 @@
 define arm_aapcs_vfpcc i16 @vaddva_v16i16_i16(<16 x i16> %s1, i16 %x) {
 ; CHECK-LABEL: vaddva_v16i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u16 r2, q1
-; CHECK-NEXT: vaddva.u16 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u16 r0, q0
+; CHECK-NEXT: vaddva.u16 r0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -165,9 +163,8 @@
 define arm_aapcs_vfpcc i8 @vaddva_v32i8_i8(<32 x i8> %s1, i8 %x) {
 ; CHECK-LABEL: vaddva_v32i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vaddv.u8 r2, q1
-; CHECK-NEXT: vaddva.u8 r2, q0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vaddva.u8 r0, q0
+; CHECK-NEXT: vaddva.u8 r0, q1
 ; CHECK-NEXT: bx lr
 entry:
   %t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
diff --git 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll @@ -49,15 +49,15 @@ define i32 @addv16i32i32(i32* %x) { ; CHECK-LABEL: addv16i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vaddv.u32 r2, q1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <16 x i32>* @@ -69,20 +69,19 @@ define i32 @addv24i32i32(i32* %x) { ; CHECK-LABEL: addv24i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <8 x i32>* @@ -99,25 +98,23 @@ define i32 @addv32i32i32(i32* %x) { ; CHECK-LABEL: addv32i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; 
CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vaddva.u32 r2, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vaddva.u32 r2, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: vaddv.u32 r0, q1 ; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r1, #96] +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #32] +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #48] +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #64] +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #112] +; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <32 x i32>* @@ -129,45 +126,39 @@ define i32 @addv64i32i32(i32* %x) { ; CHECK-LABEL: addv64i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #240] -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrw.u32 q1, [r0, #176] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #208] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #208] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r12, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #144] -; CHECK-NEXT: vaddva.u32 r12, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #224] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, 
#112] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #224] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vaddv.u32 r12, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #160] -; CHECK-NEXT: vaddva.u32 r12, q0 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: add r1, r2 -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #192] ; CHECK-NEXT: vaddva.u32 r2, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: add.w r3, r2, r12 -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #128] +; CHECK-NEXT: vldrw.u32 q0, [r0, #160] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #144] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r0, r3 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #128] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #192] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #240] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <64 x i32>* @@ -455,15 +446,15 @@ define i32 @addv16i32i16(i16* %x) { ; CHECK-LABEL: addv16i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q1, [r0, #24] -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] -; CHECK-NEXT: vaddv.u32 r2, q1 ; CHECK-NEXT: vldrh.s32 q1, [r0, #16] -; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: vaddva.u32 r2, q0 +; 
CHECK-NEXT: vldrh.s32 q0, [r0, #24] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <16 x i16>* @@ -476,16 +467,15 @@ define i32 @addv24i32i16(i16* %x) { ; CHECK-LABEL: addv24i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q1, [r0, #24] -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrh.s32 q1, [r0, #16] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vaddv.u32 r2, q1 ; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] -; CHECK-NEXT: add r2, r12 ; CHECK-NEXT: vaddva.s16 r2, q0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr @@ -506,25 +496,23 @@ define i32 @addv32i32i16(i16* %x) { ; CHECK-LABEL: addv32i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q1, [r0, #56] -; CHECK-NEXT: vldrh.s32 q0, [r0, #24] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrh.s32 q1, [r0, #40] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #48] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #56] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <32 x 
i16>* @@ -537,39 +525,34 @@ define i32 @addv64i32i16(i16* %x) { ; CHECK-LABEL: addv64i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q1, [r0, #56] -; CHECK-NEXT: vldrh.s32 q0, [r0, #24] -; CHECK-NEXT: ldrsh.w r3, [r0, #122] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrh.s32 q1, [r0, #40] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: ldrsh.w r1, [r0, #120] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #48] +; CHECK-NEXT: ldrsh.w r3, [r0, #122] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #48] +; CHECK-NEXT: ldrsh.w r12, [r0, #124] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r12, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #32] -; CHECK-NEXT: vaddva.u32 r12, q0 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #88] ; CHECK-NEXT: vaddva.u32 r2, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #72] -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vaddv.u32 r12, q1 -; CHECK-NEXT: vldrh.s32 q1, [r0, #80] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #64] -; CHECK-NEXT: add r1, r2 -; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #80] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #72] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #56] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #88] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: ldrsh.w r12, [r0, #124] -; CHECK-NEXT: add r2, r1 -; CHECK-NEXT: ldrsh.w r1, [r0, #120] ; CHECK-NEXT: vaddva.s16 r2, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #112] ; CHECK-NEXT: 
ldrsh.w r0, [r0, #126] @@ -840,25 +823,23 @@ define i32 @addv32i32i8(i8* %x) { ; CHECK-LABEL: addv32i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q1, [r0, #28] -; CHECK-NEXT: vldrb.u32 q0, [r0, #12] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrb.u32 q1, [r0, #20] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrb.u32 q1, [r0, #24] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #24] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #8] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrb.u32 q1, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r0, q1 -; CHECK-NEXT: vaddva.u32 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #28] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i8* %x to <32 x i8>* @@ -871,29 +852,26 @@ define i32 @addv64i32i8(i8* %x) { ; CHECK-LABEL: addv64i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q1, [r0, #28] -; CHECK-NEXT: vldrb.u32 q0, [r0, #12] -; CHECK-NEXT: ldrb.w r3, [r0, #61] -; CHECK-NEXT: vaddv.u32 r12, q1 ; CHECK-NEXT: vldrb.u32 q1, [r0, #20] -; CHECK-NEXT: vaddva.u32 r12, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: ldrb.w r1, [r0, #60] ; CHECK-NEXT: vaddv.u32 r2, q1 -; CHECK-NEXT: vldrb.u32 q1, [r0, #24] +; CHECK-NEXT: ldrb.w r3, [r0, #61] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #24] +; CHECK-NEXT: ldrb.w r12, [r0, #62] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #8] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u32 r12, q1 -; CHECK-NEXT: vldrb.u32 q1, [r0, #16] -; CHECK-NEXT: 
vaddva.u32 r12, q0 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0] -; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #28] ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: ldrb.w r12, [r0, #62] -; CHECK-NEXT: add r2, r1 -; CHECK-NEXT: ldrb.w r1, [r0, #60] ; CHECK-NEXT: vaddva.u8 r2, q0 ; CHECK-NEXT: vldrb.u16 q0, [r0, #48] ; CHECK-NEXT: vaddva.u16 r2, q0 @@ -1104,16 +1082,15 @@ define signext i16 @addv32i16i16(i16* %x) { ; CHECK-LABEL: addv32i16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u16 q1, [r0, #48] -; CHECK-NEXT: vldrh.u16 q0, [r0, #16] -; CHECK-NEXT: vaddv.u16 r2, q1 ; CHECK-NEXT: vldrh.u16 q1, [r0, #32] -; CHECK-NEXT: vaddva.u16 r2, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vaddv.u16 r0, q1 -; CHECK-NEXT: vaddva.u16 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vaddv.u16 r2, q1 +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #16] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #48] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <32 x i16>* @@ -1125,26 +1102,23 @@ define signext i16 @addv64i16i16(i16* %x) { ; CHECK-LABEL: addv64i16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u16 q1, [r0, #112] -; CHECK-NEXT: vldrh.u16 q0, [r0, #48] -; CHECK-NEXT: vaddv.u16 r12, q1 ; CHECK-NEXT: vldrh.u16 q1, [r0, #80] -; CHECK-NEXT: vaddva.u16 r12, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] ; CHECK-NEXT: vaddv.u16 r2, q1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #96] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #96] ; CHECK-NEXT: vaddva.u16 r2, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] -; CHECK-NEXT: add.w r1, r2, r12 -; CHECK-NEXT: vaddv.u16 
r2, q1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #64] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #48] ; CHECK-NEXT: vaddva.u16 r2, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vaddv.u16 r0, q1 -; CHECK-NEXT: vaddva.u16 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #64] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #112] +; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <64 x i16>* @@ -1370,16 +1344,15 @@ define zeroext i8 @addv64i8i8(i8* %x) { ; CHECK-LABEL: addv64i8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0, #48] -; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: vaddv.u8 r2, q1 ; CHECK-NEXT: vldrb.u8 q1, [r0, #32] -; CHECK-NEXT: vaddva.u8 r2, q0 ; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vaddv.u8 r0, q1 -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: vaddv.u8 r2, q1 +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vaddva.u8 r2, q0 +; CHECK-NEXT: uxtb r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i8* %x to <64 x i8>* @@ -1515,19 +1488,19 @@ define i32 @mlav16i32i32(i32* %x, i32* %y) { ; CHECK-LABEL: mlav16i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] ; CHECK-NEXT: vmlav.u32 r2, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, 
[r1, #16] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r1, #48] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <16 x i32>* @@ -1542,26 +1515,25 @@ define i32 @mlav24i32i32(i32* %x, i32* %y) { ; CHECK-LABEL: mlav24i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q1, [r1, #80] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: add.w r3, r2, r12 -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #48] +; CHECK-NEXT: vldrw.u32 q1, [r1, #48] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #80] +; CHECK-NEXT: vldrw.u32 q1, [r1, #80] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: adds r0, r3, r2 +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #16] +; CHECK-NEXT: vldrw.u32 q1, [r1, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <8 x i32>* @@ -1585,33 +1557,31 @@ define i32 @mlav32i32i32(i32* %x, i32* %y) { ; CHECK-LABEL: mlav32i32i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vldrw.u32 q1, [r1, #112] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrw.u32 
q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r1, #48] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #16] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #96] ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] -; CHECK-NEXT: add.w r3, r2, r12 -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #32] ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r1, #64] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #48] +; CHECK-NEXT: vldrw.u32 q1, [r1, #48] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 -; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #64] +; CHECK-NEXT: vldrw.u32 q1, [r1, #64] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r2, #112] +; CHECK-NEXT: vldrw.u32 q1, [r1, #112] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i32* %x to <32 x i32>* @@ -2279,19 +2249,19 @@ define i32 @mlav16i32i16(i16* %x, i16* %y) { ; CHECK-LABEL: mlav16i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r0, #24] -; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: 
vldrh.s32 q0, [r0, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] ; CHECK-NEXT: vmlav.u32 r2, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 +; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r0, #24] +; CHECK-NEXT: vldrh.s32 q1, [r1, #24] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <16 x i16>* @@ -2308,23 +2278,22 @@ define i32 @mlav24i32i16(i16* %x, i16* %y) { ; CHECK-LABEL: mlav24i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r0, #40] -; CHECK-NEXT: vldrh.s32 q1, [r1, #40] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #24] -; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #32] ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #16] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #24] +; CHECK-NEXT: vldrh.s32 q1, [r1, #24] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #40] +; CHECK-NEXT: vldrh.s32 q1, [r1, #40] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r2] ; CHECK-NEXT: vldrh.u16 q1, [r1] -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmlava.s16 r2, q1, q0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmlava.s16 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <8 x i16>* @@ -2352,33 +2321,31 @@ define i32 @mlav32i32i16(i16* %x, i16* %y) { ; CHECK-LABEL: mlav32i32i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r0, #56] -; CHECK-NEXT: vldrh.s32 q1, [r1, #56] -; 
CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #24] -; CHECK-NEXT: vldrh.s32 q1, [r1, #24] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrh.s32 q0, [r0, #40] ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #8] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #8] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #48] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #48] ; CHECK-NEXT: vldrh.s32 q1, [r1, #48] -; CHECK-NEXT: add.w r3, r2, r12 -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #16] ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0, #32] -; CHECK-NEXT: vldrh.s32 q1, [r1, #32] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #24] +; CHECK-NEXT: vldrh.s32 q1, [r1, #24] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2] ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 -; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #32] +; CHECK-NEXT: vldrh.s32 q1, [r1, #32] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrh.s32 q0, [r2, #56] +; CHECK-NEXT: vldrh.s32 q1, [r1, #56] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <32 x i16>* @@ -2861,33 +2828,31 @@ define i32 @mlav32i32i8(i8* %x, i8* %y) { ; CHECK-LABEL: mlav32i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r0, #28] -; CHECK-NEXT: vldrb.u32 q1, [r1, #28] -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0, #12] -; CHECK-NEXT: vldrb.u32 q1, [r1, #12] -; 
CHECK-NEXT: vmlava.u32 r12, q1, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #20] ; CHECK-NEXT: vldrb.u32 q1, [r1, #20] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #4] ; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0, #24] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #24] ; CHECK-NEXT: vldrb.u32 q1, [r1, #24] -; CHECK-NEXT: add.w r3, r2, r12 -; CHECK-NEXT: vmlav.u32 r12, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0, #8] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #8] ; CHECK-NEXT: vldrb.u32 q1, [r1, #8] -; CHECK-NEXT: vmlava.u32 r12, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0, #16] -; CHECK-NEXT: vldrb.u32 q1, [r1, #16] -; CHECK-NEXT: vmlav.u32 r2, q1, q0 -; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #12] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2] ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmlava.u32 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 -; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #16] +; CHECK-NEXT: vldrb.u32 q1, [r1, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #28] +; CHECK-NEXT: vldrb.u32 q1, [r1, #28] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = bitcast i8* %x to <32 x i8>* @@ -3198,20 +3163,19 @@ define signext i16 @mlav32i16i16(i16* %x, i16* %y) { ; CHECK-LABEL: mlav32i16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u16 q0, [r0, #48] -; CHECK-NEXT: vldrh.u16 q1, [r1, #48] -; CHECK-NEXT: vmlav.u16 r12, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r0, #16] -; CHECK-NEXT: vldrh.u16 q1, [r1, #16] -; CHECK-NEXT: vmlava.u16 r12, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q1, [r1, 
#32] ; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r0] ; CHECK-NEXT: vldrh.u16 q1, [r1] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 -; CHECK-NEXT: add.w r0, r2, r12 -; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #16] +; CHECK-NEXT: vldrh.u16 q1, [r1, #16] +; CHECK-NEXT: vmlava.u16 r2, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #48] +; CHECK-NEXT: vldrh.u16 q1, [r1, #48] +; CHECK-NEXT: vmlava.u16 r2, q1, q0 +; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: bx lr entry: %0 = bitcast i16* %x to <32 x i16>*