diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -50,6 +50,7 @@
   SDValue getIndex() { return Index; }
   SDValue getIndex() const { return Index; }
   bool hasValidOffset() const { return Offset.hasValue(); }
+  int64_t getOffset() const { return *Offset; }
 
   // Returns true if `Other` and `*this` are both some offset from the same base
   // pointer. In that case, `Off` is set to the offset between `*this` and
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -55,6 +55,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -13064,11 +13065,7 @@
   return SDValue();
 }
 
-static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
-                                   const ARMSubtarget *Subtarget) {
-  if (!Subtarget->hasMVEIntegerOps())
-    return SDValue();
-
+static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -13125,6 +13122,106 @@
   if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
     return R;
 
+  // Distribute add(vecreduce(load(Y)), vecreduce(load(Z))), or
+  // add(add(X, vecreduce(load(Y))), vecreduce(load(Z))), by ascending load
+  // offsets. This can help cores prefetch, as the order of the loads becomes
+  // more predictable.
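+  // For example, when the two reductions load from offsets B < C, the tree
+  //   add(add(X, vecreduce(load C)), vecreduce(load B))
+  // is rewritten to
+  //   add(add(X, vecreduce(load B)), vecreduce(load C)).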
+  auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
+    // Check if two reductions are known to load data where one is before/after
+    // another. Return negative if N0 loads data before N1, positive if N1 is
+    // before N0, and 0 if nothing is known.
+    auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
+      // Look through to the first operand of a MUL, for the VMLA case.
+      // Currently only looks at the first operand, in the hope they are equal.
+      if (N0.getOpcode() == ISD::MUL)
+        N0 = N0.getOperand(0);
+      if (N1.getOpcode() == ISD::MUL)
+        N1 = N1.getOperand(0);
+
+      // The two operands must be simple, non-indexed loads from the same
+      // object, with offsets that are known relative to a common base.
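+      // E.g. loads from [Base] and [Base, #16] decompose to the same base
+      // with offsets 0 and 16, so the first is known to come earlier.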
+      LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
+      LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
+      if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
+          !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
+          Load1->isIndexed())
+        return 0;
+
+      auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
+      auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
+
+      if (!BaseLocDecomp0.getBase() ||
+          BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
+          !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
+        return 0;
+      if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
+        return -1;
+      if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
+        return 1;
+      return 0;
+    };
+
+    SDValue X;
+    if (N0.getOpcode() == ISD::ADD) {
+      if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
+        int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
+                                          N0.getOperand(1).getOperand(0));
+        if (IsBefore < 0) {
+          X = N0.getOperand(0);
+          N0 = N0.getOperand(1);
+        } else if (IsBefore > 0) {
+          X = N0.getOperand(1);
+          N0 = N0.getOperand(0);
+        } else
+          return SDValue();
+      } else if (IsVecReduce(N0.getOperand(0))) {
+        X = N0.getOperand(1);
+        N0 = N0.getOperand(0);
+      } else if (IsVecReduce(N0.getOperand(1))) {
+        X = N0.getOperand(0);
+        N0 = N0.getOperand(1);
+      } else
+        return SDValue();
+    } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
+               IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
+      // Note this is backward to how you would expect. We create
+      // add(reduce(load + 16), reduce(load + 0)) so that the
+      // add(reduce(load + 16), X) is combined into VADDVA(X, load + 16),
+      // leaving the X as VADDV(load + 0).
+      return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
+    } else
+      return SDValue();
+
+    if (!IsVecReduce(N0) || !IsVecReduce(N1))
+      return SDValue();
+
+    if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
+      return SDValue();
+
+    // Switch from add(add(X, N0), N1) to add(add(X, N1), N0).
+    SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
+    return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
+  };
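+  // Try both operand orders; only the forward direction performs the plain
+  // swap of two ordered reductions above.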
+  if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
+    return R;
+  if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
+    return R;
+  return SDValue();
+}
+
+static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return SDValue();
+
+  if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
+    return R;
+
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc dl(N);
+
   if (VT != MVT::i64)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll
@@ -34,8 +34,8 @@
 define i32 @addv8i32i32(i32* %x) {
 ; CHECK-LABEL: addv8i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u32 r0, q1
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
@@ -49,11 +49,11 @@
 define i32 @addv16i32i32(i32* %x) {
 ; CHECK-LABEL: addv16i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
@@ -69,17 +69,17 @@
 define i32 @addv24i32i32(i32* %x) {
 ; CHECK-LABEL: addv24i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bx lr
@@ -98,21 +98,21 @@
 define i32 @addv32i32i32(i32* %x) {
 ; CHECK-LABEL: addv32i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    mov r1, r0
 ; CHECK-NEXT:    vaddv.u32 r0, q1
 ; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
-; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
 ; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, #64]
 ; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r1, #80]
+; CHECK-NEXT:    vaddva.u32 r0, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
+; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, #112]
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    bx lr
@@ -126,36 +126,36 @@
 define i32 @addv64i32i32(i32* %x) {
 ; CHECK-LABEL: addv64i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
+; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    mov r0, r2
@@ -446,11 +446,11 @@
 define i32 @addv16i32i16(i16* %x) {
 ; CHECK-LABEL: addv16i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
@@ -467,11 +467,11 @@
 define i32 @addv24i32i16(i16* %x) {
 ; CHECK-LABEL: addv24i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #16]
-; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
@@ -496,20 +496,20 @@
 define i32 @addv32i32i16(i16* %x) {
 ; CHECK-LABEL: addv32i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #40]
+; CHECK-NEXT:    vldrh.s32 q1, [r0]
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
+; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    mov r0, r2
@@ -525,30 +525,30 @@
 define i32 @addv64i32i16(i16* %x) {
 ; CHECK-LABEL: addv64i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q1, [r0, #40]
+; CHECK-NEXT:    vldrh.s32 q1, [r0]
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
 ; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
-; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #88]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
@@ -801,10 +801,10 @@
 define i32 @addv24i32i8(i8* %x) {
 ; CHECK-LABEL: addv24i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q0, [r0]
-; CHECK-NEXT:    vaddv.u16 r0, q1
-; CHECK-NEXT:    vaddva.u8 r0, q0
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
+; CHECK-NEXT:    vaddv.u8 r0, q1
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i8* %x to <16 x i8>*
@@ -823,20 +823,20 @@
 define i32 @addv32i32i8(i8* %x) {
 ; CHECK-LABEL: addv32i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #20]
+; CHECK-NEXT:    vldrb.u32 q1, [r0]
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r0]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
+; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    mov r0, r2
@@ -852,23 +852,23 @@
 define i32 @addv64i32i8(i8* %x) {
 ; CHECK-LABEL: addv64i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q1, [r0, #20]
+; CHECK-NEXT:    vldrb.u32 q1, [r0]
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
 ; CHECK-NEXT:    ldrb.w r1, [r0, #60]
 ; CHECK-NEXT:    vaddv.u32 r2, q1
 ; CHECK-NEXT:    ldrb.w r3, [r0, #61]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
-; CHECK-NEXT:    ldrb.w r12, [r0, #62]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
+; CHECK-NEXT:    ldrb.w r12, [r0, #62]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r0]
-; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
+; CHECK-NEXT:    vaddva.u32 r2, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
+; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
 ; CHECK-NEXT:    vaddva.u32 r2, q0
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
@@ -1043,8 +1043,8 @@
 define signext i16 @addv16i16i16(i16* %x) {
 ; CHECK-LABEL: addv16i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u16 r0, q1
 ; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    sxth r0, r0
@@ -1059,11 +1059,11 @@
 define signext i16 @addv24i16i16(i16* %x) {
 ; CHECK-LABEL: addv24i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u16 r2, q1
 ; CHECK-NEXT:    vaddva.u16 r2, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
@@ -1082,11 +1082,11 @@
 define signext i16 @addv32i16i16(i16* %x) {
 ; CHECK-LABEL: addv32i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u16 r2, q1
 ; CHECK-NEXT:    vaddva.u16 r2, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
@@ -1102,20 +1102,20 @@
 define signext i16 @addv64i16i16(i16* %x) {
 ; CHECK-LABEL: addv64i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q1, [r0, #80]
+; CHECK-NEXT:    vldrh.u16 q1, [r0]
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u16 r2, q1
 ; CHECK-NEXT:    vaddva.u16 r2, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
-; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
+; CHECK-NEXT:    vaddva.u16 r2, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
+; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
 ; CHECK-NEXT:    vaddva.u16 r2, q0
 ; CHECK-NEXT:    sxth r0, r2
@@ -1328,8 +1328,8 @@
 define zeroext i8 @addv32i8i8(i8* %x) {
 ; CHECK-LABEL: addv32i8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u8 r0, q1
 ; CHECK-NEXT:    vaddva.u8 r0, q0
 ; CHECK-NEXT:    uxtb r0, r0
@@ -1344,11 +1344,11 @@
 define zeroext i8 @addv64i8i8(i8* %x) {
 ; CHECK-LABEL: addv64i8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
-; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
 ; CHECK-NEXT:    vaddv.u8 r2, q1
 ; CHECK-NEXT:    vaddva.u8 r2, q0
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
 ; CHECK-NEXT:    vaddva.u8 r2, q0
 ; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
 ; CHECK-NEXT:    vaddva.u8 r2, q0
@@ -1467,11 +1467,11 @@
 define i32 @mlav8i32i32(i32* %x, i32* %y) {
 ; CHECK-LABEL: mlav8i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT:    vmlav.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r2, q1, q0
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    bx lr
@@ -1488,15 +1488,15 @@
 define i32 @mlav16i32i32(i32* %x, i32* %y) {
 ; CHECK-LABEL: mlav16i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
-; CHECK-NEXT:    vmlav.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
 ; CHECK-NEXT:    vmlava.u32 r2, q1, q0
@@ -1515,25 +1515,25 @@
 define i32 @mlav24i32i32(i32* %x, i32* %y) {
 ; CHECK-LABEL: mlav24i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    vmlav.u32 r0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i32* %x to <8 x i32>*
@@ -1557,28 +1557,28 @@
 define i32 @mlav32i32i32(i32* %x, i32* %y) {
 ; CHECK-LABEL: mlav32i32i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    vmlav.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
-; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
+; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
 ; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
@@ -2249,15 +2249,15 @@
 define i32 @mlav16i32i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav16i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
-; CHECK-NEXT:    vmlav.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vmlav.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vmlava.u32 r2, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
+; CHECK-NEXT:    vmlava.u32 r2, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
 ; CHECK-NEXT:    vmlava.u32 r2, q1, q0
@@ -2278,22 +2278,22 @@
 define i32 @mlav24i32i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav24i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    vmlav.u32 r0, q1, q0
+; CHECK-NEXT:    vmlav.s16 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r2]
-; CHECK-NEXT:    vldrh.u16 q1, [r1]
-; CHECK-NEXT:    vmlava.s16 r0, q1, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = bitcast i16* %x to <8 x i16>*
@@ -2321,28 +2321,28 @@
 define i32 @mlav32i32i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav32i32i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
+; CHECK-NEXT:    vldrh.s32 q0, [r0]
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    vmlav.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #8]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r2, #48]
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #48]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrh.s32 q0, [r2]
-; CHECK-NEXT:    vldrh.s32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrh.s32 q0, [r2, #48]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #48]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrh.s32 q0, [r2, #56]
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #56]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
@@ -2828,28 +2828,28 @@
 define i32 @mlav32i32i8(i8* %x, i8* %y) {
 ; CHECK-LABEL: mlav32i32i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #20]
+; CHECK-NEXT:    vldrb.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
 ; CHECK-NEXT:    mov r2, r0
 ; CHECK-NEXT:    vmlav.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r2, #4]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r2, #24]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #24]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r2, #8]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #8]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r2, #12]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
-; CHECK-NEXT:    vldrb.u32 q0, [r2]
-; CHECK-NEXT:    vldrb.u32 q1, [r1]
-; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r2, #16]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r2, #20]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #20]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
+; CHECK-NEXT:    vldrb.u32 q0, [r2, #24]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #24]
+; CHECK-NEXT:    vmlava.u32 r0, q1, q0
 ; CHECK-NEXT:    vldrb.u32 q0, [r2, #28]
 ; CHECK-NEXT:    vldrb.u32 q1, [r1, #28]
 ; CHECK-NEXT:    vmlava.u32 r0, q1, q0
@@ -3109,11 +3109,11 @@
 define signext i16 @mlav16i16i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav16i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
-; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
-; CHECK-NEXT:    vmlav.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
 ; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
@@ -3130,14 +3130,14 @@
 define signext i16 @mlav24i16i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav24i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
 ; CHECK-NEXT:    vmlav.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
 ; CHECK-NEXT:    sxth r0, r2
 ; CHECK-NEXT:    bx lr
@@ -3163,15 +3163,15 @@
 define signext i16 @mlav32i16i16(i16* %x, i16* %y) {
 ; CHECK-LABEL: mlav32i16i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
-; CHECK-NEXT:    vmlav.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1]
-; CHECK-NEXT:    vmlava.u16 r2, q1, q0
+; CHECK-NEXT:    vmlav.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
+; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
+; CHECK-NEXT:    vmlava.u16 r2, q1, q0
 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
 ; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
 ; CHECK-NEXT:    vmlava.u16 r2, q1, q0
@@ -3598,11 +3598,11 @@
 define zeroext i8 @mlav32i8i8(i8* %x, i8* %y) {
 ; CHECK-LABEL: mlav32i8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
-; CHECK-NEXT:    vmlav.u8 r2, q1, q0
 ; CHECK-NEXT:    vldrb.u8 q0, [r0]
 ; CHECK-NEXT:    vldrb.u8 q1, [r1]
+; CHECK-NEXT:    vmlav.u8 r2, q1, q0
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
 ; CHECK-NEXT:    vmlava.u8 r2, q1, q0
 ; CHECK-NEXT:    uxtb r0, r2
 ; CHECK-NEXT:    bx lr