diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15976,6 +15976,15 @@
   SDValue N0 = N->getOperand(0);
   SDLoc dl(N);
 
+  // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
+  if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
+      (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
+       N0.getValueType() == MVT::v16i8)) {
+    SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
+    SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
+    return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
+  }
+
   // We are looking for something that will have illegal types if left alone,
   // but that we can convert to a single instruction under MVE. For example
   // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -35,8 +35,8 @@
 define arm_aapcs_vfpcc i32 @vaddv_v8i32_i32(<8 x i32> %s1) {
 ; CHECK-LABEL: vaddv_v8i32_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %r = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -56,8 +56,8 @@
 define arm_aapcs_vfpcc i16 @vaddv_v16i16_i16(<16 x i16> %s1) {
 ; CHECK-LABEL: vaddv_v16i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vaddv.u16 r0, q1
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %r = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -77,8 +77,8 @@
 define arm_aapcs_vfpcc i8 @vaddv_v32i8_i8(<32 x i8> %s1) {
 ; CHECK-LABEL: vaddv_v32i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i8 q0, q0, q1
-; CHECK-NEXT:    vaddv.u8 r0, q0
+; CHECK-NEXT:    vaddv.u8 r0, q1
+; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %r = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
@@ -117,8 +117,9 @@
 define arm_aapcs_vfpcc i32 @vaddva_v8i32_i32(<8 x i32> %s1, i32 %x) {
 ; CHECK-LABEL: vaddva_v8i32_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -140,8 +141,9 @@
 define arm_aapcs_vfpcc i16 @vaddva_v16i16_i16(<16 x i16> %s1, i16 %x) {
 ; CHECK-LABEL: vaddva_v16i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vaddv.u16 r2, q1
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -163,8 +165,9 @@
 define arm_aapcs_vfpcc i8 @vaddva_v32i8_i8(<32 x i8> %s1, i8 %x) {
 ; CHECK-LABEL: vaddva_v32i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.i8 q0, q0, q1
-; CHECK-NEXT:    vaddva.u8 r0, q0
+; CHECK-NEXT:    vaddv.u8 r2, q1
+; CHECK-NEXT:    vaddva.u8 r2, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -522,13 +522,11 @@
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
 ; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
-; CHECK-NEXT:    vldrb.s16 q2, [r1]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r0]
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vldrb.s16 q1, [r1]
+; CHECK-NEXT:    vmlava.u16 r2, q1, q0
+; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -703,53 +703,50 @@
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q1, #0xff
-; CHECK-NEXT:    vldrb.u16 q2, [r0]
+; CHECK-NEXT:    vldrb.s16 q2, [r1, #8]
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    vldrb.s16 q3, [r1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i16 q1, q3, q2
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    vmov.16 q2[5], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    vmov.16 q2[6], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vldrb.s16 q2, [r1, #8]
-; CHECK-NEXT:    vmul.i16 q0, q2, q0
+; CHECK-NEXT:    vmlavt.u16 r2, q2, q1
+; CHECK-NEXT:    vmov.16 q1[0], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[1]
+; CHECK-NEXT:    vmov.16 q1[1], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[2]
+; CHECK-NEXT:    vmov.16 q1[2], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[3]
+; CHECK-NEXT:    vmov.16 q1[3], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[4]
+; CHECK-NEXT:    vmov.16 q1[4], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[5]
+; CHECK-NEXT:    vmov.16 q1[5], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[6]
+; CHECK-NEXT:    vmov.16 q1[6], r3
+; CHECK-NEXT:    vmov.u8 r3, q0[7]
+; CHECK-NEXT:    vmov.16 q1[7], r3
+; CHECK-NEXT:    vldrb.u16 q0, [r0]
+; CHECK-NEXT:    vcmp.i16 ne, q1, zr
+; CHECK-NEXT:    vldrb.s16 q1, [r1]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vaddt.i16 q1, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q1
-; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmlavat.u16 r2, q1, q0
+; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
@@ -34,10 +34,10 @@
 define i32 @addv8i32i32(i32* %x) {
 ; CHECK-LABEL: addv8i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <8 x i32>*
@@ -49,14 +49,15 @@
 define i32 @addv16i32i32(i32* %x) {
 ; CHECK-LABEL: addv16i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <16 x i32>*
@@ -68,18 +69,20 @@
 define i32 @addv24i32i32(i32* %x) {
 ; CHECK-LABEL: addv24i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    add.w r1, r2, r12
 ; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <8 x i32>*
@@ -96,22 +99,25 @@
 define i32 @addv32i32i32(i32* %x) {
 ; CHECK-LABEL: addv32i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <32 x i32>*
@@ -123,41 +129,45 @@
 define i32 @addv64i32i32(i32* %x) {
 ; CHECK-LABEL: addv64i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #240]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #208]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r12, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #144]
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #224]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #192]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #128]
-; CHECK-NEXT:    vadd.i32 q3, q4, q3
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vaddv.u32 r12, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #160]
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    add r1, r2
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #192]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    add.w r3, r2, r12
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #128]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    add r0, r3
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <64 x i32>*
@@ -445,14 +455,15 @@
 define i32 @addv16i32i16(i16* %x) {
 ; CHECK-LABEL: addv16i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q2, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <16 x i16>*
@@ -465,16 +476,18 @@
 define i32 @addv24i32i16(i16* %x) {
 ; CHECK-LABEL: addv24i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q2, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
-; CHECK-NEXT:    vaddv.s16 r0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vaddva.s16 r2, q0
+; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <16 x i16>*
@@ -493,22 +506,25 @@
 define i32 @addv32i32i16(i16* %x) {
 ; CHECK-LABEL: addv32i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #56]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #40]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #48]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #32]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #32]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <32 x i16>*
@@ -521,34 +537,39 @@
 define i32 @addv64i32i16(i16* %x) {
 ; CHECK-LABEL: addv64i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #56]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #40]
-; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #48]
-; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #32]
-; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vldrh.s32 q3, [r0, #64]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #72]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r12, q1
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #32]
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrh.s32 q1, [r0, #88]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #80]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vaddv.u32 r12, q1
+; CHECK-NEXT:    vldrh.s32 q1, [r0, #80]
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
+; CHECK-NEXT:    add r1, r2
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
+; CHECK-NEXT:    add r2, r1
+; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
 ; CHECK-NEXT:    vaddva.s16 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #112]
 ; CHECK-NEXT:    ldrsh.w r0, [r0, #126]
@@ -819,22 +840,25 @@
 define i32 @addv32i32i8(i8* %x) {
 ; CHECK-LABEL: addv32i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #12]
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
-; CHECK-NEXT:    vldrb.u32 q3, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r0, #28]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrb.u32 q1, [r0, #20]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #8]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrb.u32 q1, [r0, #24]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #16]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vldrb.u32 q1, [r0, #16]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r0, q1
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i8* %x to <32 x i8>*
@@ -847,27 +871,30 @@
 define i32 @addv64i32i8(i8* %x) {
 ; CHECK-LABEL: addv64i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #12]
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
-; CHECK-NEXT:    vldrb.u32 q3, [r0]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r0, #28]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
+; CHECK-NEXT:    ldrb.w r3, [r0, #61]
+; CHECK-NEXT:    vaddv.u32 r12, q1
 ; CHECK-NEXT:    vldrb.u32 q1, [r0, #20]
-; CHECK-NEXT:    ldrb.w r1, [r0, #60]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #8]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vldrb.u32 q1, [r0, #24]
-; CHECK-NEXT:    ldrb.w r3, [r0, #61]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #16]
-; CHECK-NEXT:    ldrb.w r12, [r0, #62]
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
-; CHECK-NEXT:    vaddv.u8 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u32 r12, q1
+; CHECK-NEXT:    vldrb.u32 q1, [r0, #16]
+; CHECK-NEXT:    vaddva.u32 r12, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    vaddv.u32 r2, q1
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    ldrb.w r12, [r0, #62]
+; CHECK-NEXT:    add r2, r1
+; CHECK-NEXT:    ldrb.w r1, [r0, #60]
+; CHECK-NEXT:    vaddva.u8 r2, q0
 ; CHECK-NEXT:    vldrb.u16 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #56]
@@ -1038,10 +1065,10 @@
 define signext i16 @addv16i16i16(i16* %x) {
 ; CHECK-LABEL: addv16i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
-; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vaddv.u16 r0, q1
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1054,13 +1081,13 @@
 define signext i16 @addv24i16i16(i16* %x) {
 ; CHECK-LABEL: addv24i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vaddv.u16 r0, q1
-; CHECK-NEXT:    vaddva.u16 r0, q0
-; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u16 r2, q1
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <8 x i16>*
@@ -1077,14 +1104,15 @@
 define signext i16 @addv32i16i16(i16* %x) {
 ; CHECK-LABEL: addv32i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vldrh.u16 q2, [r0]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #48]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u16 r2, q1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vaddv.u16 r0, q1
+; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1097,22 +1125,25 @@
 define signext i16 @addv64i16i16(i16* %x) {
 ; CHECK-LABEL: addv64i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #48]
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #16]
-; CHECK-NEXT:    vldrh.u16 q3, [r0]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #112]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
+; CHECK-NEXT:    vaddv.u16 r12, q1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, #80]
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #32]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vaddva.u16 r12, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u16 r2, q1
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, #96]
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #64]
-; CHECK-NEXT:    vadd.i16 q2, q3, q2
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT:    add.w r1, r2, r12
+; CHECK-NEXT:    vaddv.u16 r2, q1
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #64]
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vaddv.u16 r0, q1
+; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    add r0, r2
+; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1323,10 +1354,10 @@
 define zeroext i8 @addv32i8i8(i8* %x) {
 ; CHECK-LABEL: addv32i8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
-; CHECK-NEXT:    vaddv.u8 r0, q0
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vaddv.u8 r0, q1
+; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1339,14 +1370,15 @@
 define zeroext i8 @addv64i8i8(i8* %x) {
 ; CHECK-LABEL: addv64i8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q2, [r0]
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u8 r2, q1
 ; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
-; CHECK-NEXT:    vadd.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
-; CHECK-NEXT:    vaddv.u8 r0, q0
+; CHECK-NEXT:    vaddva.u8 r2, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vaddv.u8 r0, q1
+; CHECK-NEXT:    vaddva.u8 r0, q0
+; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    uxtb r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1464,12 +1496,11 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <8 x i32>*
@@ -1486,20 +1517,17 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <16 x i32>*
@@ -1516,28 +1544,24 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r1, #32]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #64]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r1]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vaddv.u32 r0, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT:    add.w r3, r2, r12
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    adds r0, r3, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <8 x i32>*
@@ -1561,41 +1585,33 @@
 define i32 @mlav32i32i32(i32* %x, i32* %y) {
 ; CHECK-LABEL: mlav32i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r1, #16]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r1]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #80]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r1, #32]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r1, #96]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r1, #64]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
+; CHECK-NEXT:    add.w r3, r2, r12
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
+; CHECK-NEXT:    add r0, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <32 x i32>*
@@ -2265,20 +2281,17 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
-; CHECK-NEXT:    vldrh.s32 q3, [r1]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <16 x i16>*
@@ -2297,23 +2310,21 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #24]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #32]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #32]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vmlav.s16 r0, q2, q1
-; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    vmlava.s16 r2, q1, q0
+; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <8 x i16>*
@@ -2341,41 +2352,33 @@
 define i32 @mlav32i32i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav32i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #56]
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #24]
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #8]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #24]
-; CHECK-NEXT:    vldrh.s32 q4, [r1]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #40]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #40]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r1, #48]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #48]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrh.s32 q3, [r1, #32]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #32]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrh.s32 q3, [r0]
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #48]
+; CHECK-NEXT:    add.w r3, r2, r12
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
+; CHECK-NEXT:    add r0, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <32 x i16>*
@@ -2858,41 +2861,33 @@
 define i32 @mlav32i32i8(i8* %x, i8* %y) {
 ; CHECK-LABEL: mlav32i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #28]
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #12]
-; CHECK-NEXT:    vldrb.u32 q3, [r1, #4]
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #12]
-; CHECK-NEXT:    vldrb.u32 q4, [r1]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #20]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #20]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #4]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r1, #24]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #24]
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #8]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrb.u32 q3, [r1, #16]
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrb.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vldrb.u32 q3, [r0]
-; CHECK-NEXT:    vmul.i32 q3, q4, q3
-; CHECK-NEXT:    vadd.i32 q2, q3, q2
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #20]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #24]
+; CHECK-NEXT:    add.w r3, r2, r12
+; CHECK-NEXT:    vmlav.u32 r12, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #8]
+; CHECK-NEXT:    vmlava.u32 r12, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #16]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
+; CHECK-NEXT:    add r0, r3
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i8* %x to <32 x i8>*
@@ -3151,13 +3146,11 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
-; CHECK-NEXT:    vldrh.u16 q2, [r1]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vmlava.u16 r2, q1, q0
+; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <16 x i16>*
@@ -3174,14 +3167,12 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
-; CHECK-NEXT:    vldrh.u16 q2, [r1, #16]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r1]
-; CHECK-NEXT:    vaddv.u16 r2, q0
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
 ; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
@@ -3209,20 +3200,17 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
-; CHECK-NEXT:    vldrh.u16 q2, [r1, #16]
-; CHECK-NEXT:    vldrh.u16 q3, [r1]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r1, #32]
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vldrh.u16 q2, [r0]
-; CHECK-NEXT:    vmul.i16 q2, q3, q2
-; CHECK-NEXT:    vadd.i16 q1, q2, q1
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmlav.u16 r12, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u16 r12, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vmlava.u16 r2, q1, q0
+; CHECK-NEXT:    add.w r0, r2, r12
 ; CHECK-NEXT:    sxth r0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -3648,13 +3636,11 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
 ; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
-; CHECK-NEXT:    vldrb.u8 q2, [r1]
-; CHECK-NEXT:    vmul.i8 q0, q1, q0
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vmul.i8 q1, q2, q1
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
-; CHECK-NEXT:    vaddv.u8 r0, q0
-; CHECK-NEXT:    uxtb r0, r0
+; CHECK-NEXT:    vmlav.u8 r2, q1, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q1, [r1]
+; CHECK-NEXT:    vmlava.u8 r2, q1, q0
+; CHECK-NEXT:    uxtb r0, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i8* %x to <32 x i8>*
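
A quick illustration of the new combine (a reviewer sketch, not part of the patch): a vecreduce_add whose operand is a legal-width vector add gets split into two reductions, so MVE can select a VADDV of one operand followed by a VADDVA of the other instead of a vector vadd feeding a single VADDV. The function below is hypothetical; it is essentially what vaddv_v8i32_i32 above reduces to once type legalization splits the illegal <8 x i32> input into two <4 x i32> halves joined by an add. The intrinsic naming follows the tests in this patch.

  define i32 @vecreduce_of_add(<4 x i32> %x, <4 x i32> %y) {
  entry:
    ; vecreduce_add(add(x, y)) becomes vecreduce_add(x) + vecreduce_add(y),
    ; which the MVE backend can select as a VADDV plus an accumulating VADDVA.
    %a = add <4 x i32> %x, %y
    %r = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
    ret i32 %r
  }

  declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)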