diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -604,6 +604,7 @@
     SDValue combineRepeatedFPDivisors(SDNode *N);
     SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex);
     SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
+    SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -20952,6 +20953,99 @@
   return DAG.getBitcast(VT, Shuf);
 }
 
+// Combine insert(shuffle(load, <u,0,1,2>), load, 0) into a single load if
+// possible and the new load will be quick. We use more loads but fewer
+// shuffles and inserts.
+SDValue DAGCombiner::combineInsertEltToLoad(SDNode *N, unsigned InsIndex) {
+  EVT VT = N->getValueType(0);
+
+  // InsIndex is expected to be the first or last lane.
+  if (!VT.isFixedLengthVector() ||
+      (InsIndex != 0 && InsIndex != VT.getVectorNumElements() - 1))
+    return SDValue();
+
+  // Look for a shuffle with the mask u,0,1,2,3,4,5,6 or 1,2,3,4,5,6,7,u
+  // depending on the InsIndex.
+  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+  SDValue Scalar = N->getOperand(1);
+  if (!Shuffle || !all_of(enumerate(Shuffle->getMask()), [&](auto P) {
+        return InsIndex == P.index() || P.value() < 0 ||
+               (InsIndex == 0 && P.value() == (int)P.index() - 1) ||
+               (InsIndex == VT.getVectorNumElements() - 1 &&
+                P.value() == (int)P.index() + 1);
+      }))
+    return SDValue();
+
+  // We optionally skip over an extend so long as both loads are extended in
+  // the same way from the same type.
+  unsigned Extend = 0;
+  if (Scalar.getOpcode() == ISD::ZERO_EXTEND ||
+      Scalar.getOpcode() == ISD::SIGN_EXTEND ||
+      Scalar.getOpcode() == ISD::ANY_EXTEND) {
+    Extend = Scalar.getOpcode();
+    Scalar = Scalar.getOperand(0);
+  }
+
+  auto *ScalarLoad = dyn_cast<LoadSDNode>(Scalar);
+  if (!ScalarLoad)
+    return SDValue();
+
+  SDValue Vec = Shuffle->getOperand(0);
+  if (Extend) {
+    if (Vec.getOpcode() != Extend)
+      return SDValue();
+    Vec = Vec.getOperand(0);
+  }
+  auto *VecLoad = dyn_cast<LoadSDNode>(Vec);
+  if (!VecLoad || Vec.getValueType().getScalarType() != Scalar.getValueType())
+    return SDValue();
+
+  int EltSize = ScalarLoad->getValueType(0).getScalarSizeInBits();
+  if (EltSize == 0 || EltSize % 8 != 0 || !ScalarLoad->isSimple() ||
+      !VecLoad->isSimple() || VecLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      ScalarLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      ScalarLoad->getAddressSpace() != VecLoad->getAddressSpace())
+    return SDValue();
+
+  // Check that the offset between the pointers is right to produce a single
+  // contiguous load.
+  if (InsIndex == 0) {
+    if (!DAG.areNonVolatileConsecutiveLoads(ScalarLoad, VecLoad, EltSize / 8,
+                                            -1))
+      return SDValue();
+  } else {
+    if (!DAG.areNonVolatileConsecutiveLoads(
+            VecLoad, ScalarLoad, VT.getVectorNumElements() * EltSize / 8, -1))
+      return SDValue();
+  }
+
+  // And that the new unaligned load will be fast.
+  unsigned IsFast = 0;
+  Align NewAlign = commonAlignment(VecLoad->getAlign(), EltSize / 8);
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                              Vec.getValueType(), VecLoad->getAddressSpace(),
+                              NewAlign, VecLoad->getMemOperand()->getFlags(),
+                              &IsFast) ||
+      !IsFast)
+    return SDValue();
+
+  // Calculate the new Ptr and create the new load.
+  SDLoc DL(N);
+  SDValue Ptr = ScalarLoad->getBasePtr();
+  if (InsIndex != 0)
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), VecLoad->getBasePtr(),
+                      DAG.getConstant(EltSize / 8, DL, Ptr.getValueType()));
+  MachinePointerInfo PtrInfo =
+      InsIndex == 0 ? ScalarLoad->getPointerInfo()
+                    : VecLoad->getPointerInfo().getWithOffset(EltSize / 8);
+
+  SDValue Load = DAG.getLoad(VecLoad->getValueType(0), DL,
+                             ScalarLoad->getChain(), Ptr, PtrInfo, NewAlign);
+  DAG.makeEquivalentMemoryOrdering(ScalarLoad, Load.getValue(1));
+  DAG.makeEquivalentMemoryOrdering(VecLoad, Load.getValue(1));
+  return Extend ? DAG.getNode(Extend, DL, VT, Load) : Load;
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -21023,6 +21117,9 @@
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
     return Shuf;
 
+  if (SDValue Shuf = combineInsertEltToLoad(N, Elt))
+    return Shuf;
+
   // Attempt to convert an insert_vector_elt chain into a legal build_vector.
   if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
     // vXi1 vector - we don't need to recurse.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -11700,7 +11700,7 @@
 
   int64_t Offset = 0;
   if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
-    return (Dist * Bytes == Offset);
+    return (Dist * (int64_t)Bytes == Offset);
   return false;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
--- a/llvm/test/CodeGen/AArch64/insertshuffleload.ll
+++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
@@ -4,10 +4,7 @@
 define <8 x i8> @inserti8_first(ptr %p) {
 ; CHECK-LABEL: inserti8_first:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldur d0, [x0, #1]
-; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -20,11 +17,7 @@
 define <8 x i8> @inserti8_last(ptr %p) {
 ; CHECK-LABEL: inserti8_last:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #1
-; CHECK-NEXT: ld1 { v0.b }[7], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur d0, [x0, #1]
 ; CHECK-NEXT: ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -37,11 +30,8 @@
 define <8 x i16> @inserti8_first_sext(ptr %p) {
 ; CHECK-LABEL: inserti8_first_sext:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldur d0, [x0, #1]
-; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14
-; CHECK-NEXT: mov v0.h[0], w8
 ; CHECK-NEXT: ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -56,11 +46,8 @@
 define <8 x i16> @inserti8_last_sext(ptr %p) {
 ; CHECK-LABEL: inserti8_last_sext:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldrsb w8, [x0, #8]
+; CHECK-NEXT: ldur d0, [x0, #1]
 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2
-; CHECK-NEXT: mov v0.h[7], w8
 ; CHECK-NEXT: ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -75,11 +62,8 @@
 define <8 x i16> @inserti8_first_zext(ptr %p) {
 ; CHECK-LABEL: inserti8_first_zext:
 ; CHECK: // %bb.0:
-; 
CHECK-NEXT: ldur d0, [x0, #1] -; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 -; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -94,11 +78,8 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { ; CHECK-LABEL: inserti8_last_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldrb w8, [x0, #8] +; CHECK-NEXT: ldur d0, [x0, #1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 -; CHECK-NEXT: mov v0.h[7], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p @@ -113,11 +94,7 @@ define <8 x i32> @inserti32_first(ptr %p) { ; CHECK-LABEL: inserti32_first: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur q1, [x0, #4] -; CHECK-NEXT: ldur q2, [x0, #20] -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v1.16b, v1.16b, v2.16b, #12 -; CHECK-NEXT: ld1 { v0.s }[0], [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 4 %l1 = load <8 x i32>, ptr %q @@ -130,11 +107,8 @@ define <8 x i32> @inserti32_last(ptr %p) { ; CHECK-LABEL: inserti32_last: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] -; CHECK-NEXT: add x8, x0, #32 -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v0.16b, v2.16b, v0.16b, #4 -; CHECK-NEXT: ld1 { v1.s }[3], [x8] +; CHECK-NEXT: ldur q0, [x0, #4] +; CHECK-NEXT: ldur q1, [x0, #20] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 32 %l1 = load <8 x i32>, ptr %p @@ -147,11 +121,9 @@ define <8 x i32> @inserti32_first_multiuse(ptr %p) { ; CHECK-LABEL: inserti32_first_multiuse: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur q0, [x0, #4] +; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ldur q1, [x0, #20] -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #12 -; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ld1 { v2.s }[0], [x0] +; CHECK-NEXT: ldur q0, [x0, #4] ; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret @@ -168,12 +140,10 @@ ; CHECK-LABEL: inserti32_last_multiuse: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add x8, x0, #32 -; CHECK-NEXT: ext v2.16b, v1.16b, v0.16b, #4 -; CHECK-NEXT: ext v3.16b, v0.16b, v1.16b, #4 -; CHECK-NEXT: ld1 { v2.s }[3], [x8] -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldur q2, [x0, #4] +; CHECK-NEXT: ldur q3, [x0, #20] +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 32 %l1 = load <8 x i32>, ptr %p @@ -187,9 +157,7 @@ define <4 x float> @insertf32_first(ptr %p) { ; CHECK-LABEL: insertf32_first: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur q0, [x0, #4] -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12 -; CHECK-NEXT: ld1 { v0.s }[0], [x0] +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 4 %l1 = load <4 x float>, ptr %q @@ -202,10 +170,7 @@ define <4 x float> @insertf32_last(ptr %p) { ; CHECK-LABEL: insertf32_last: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ld1 { v0.s }[3], [x8] +; CHECK-NEXT: ldur q0, [x0, #4] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 16 %l1 = load <4 x float>, ptr %p @@ -218,9 +183,7 @@ define <2 x i64> @inserti64_first(ptr %p) { ; CHECK-LABEL: inserti64_first: ; CHECK: // 
%bb.0: -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: ld1r { v0.2d }, [x8] -; CHECK-NEXT: ld1 { v0.d }[0], [x0] +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <2 x i64>, ptr %q @@ -233,10 +196,7 @@ define <2 x i64> @inserti64_last(ptr %p) { ; CHECK-LABEL: inserti64_last: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x0, #16 -; CHECK-NEXT: dup v0.2d, v0.d[1] -; CHECK-NEXT: ld1 { v0.d }[1], [x8] +; CHECK-NEXT: ldur q0, [x0, #8] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 16 %l1 = load <2 x i64>, ptr %p @@ -249,10 +209,7 @@ define <8 x i8> @inserti8_first_undef(ptr %p) { ; CHECK-LABEL: inserti8_first_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] -; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -265,11 +222,7 @@ define <8 x i8> @inserti8_last_undef(ptr %p) { ; CHECK-LABEL: inserti8_last_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: dup v0.8b, v0.b[1] -; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ldur d0, [x0, #1] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p @@ -445,10 +398,7 @@ ; CHECK-LABEL: storebefore: ; CHECK: // %bb.0: ; CHECK-NEXT: strb wzr, [x1] -; CHECK-NEXT: ldur d0, [x0, #1] -; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 store i8 0, ptr %r @@ -462,11 +412,8 @@ define <8 x i8> @storeafter(ptr %p, ptr %r) { ; CHECK-LABEL: storeafter: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] -; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #7 -; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: strb wzr, [x1] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll --- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll +++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll @@ -6,37 +6,13 @@ define <8 x i8> @inserti8_first(ptr %p) { ; CHECKLE-LABEL: inserti8_first: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1] -; CHECKLE-NEXT: ldrb r1, [r0] -; CHECKLE-NEXT: vmovx.f16 s10, s5 -; CHECKLE-NEXT: vmovx.f16 s8, s4 -; CHECKLE-NEXT: vins.f16 s10, s6 -; CHECKLE-NEXT: vmovx.f16 s6, s6 -; CHECKLE-NEXT: vmov.16 q0[0], r1 -; CHECKLE-NEXT: vins.f16 s8, s5 -; CHECKLE-NEXT: vins.f16 s6, s7 -; CHECKLE-NEXT: vmov.f32 s1, s8 -; CHECKLE-NEXT: vmov.f32 s2, s10 -; CHECKLE-NEXT: vins.f16 s0, s4 -; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: vldrb.u16 q0, [r0] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_first: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1] -; CHECKBE-NEXT: ldrb r1, [r0] -; CHECKBE-NEXT: vmovx.f16 s6, s1 -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vins.f16 s6, s2 -; CHECKBE-NEXT: vmovx.f16 s2, s2 -; CHECKBE-NEXT: vmov.16 q2[0], r1 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 s2, s3 -; CHECKBE-NEXT: vins.f16 s8, s0 -; CHECKBE-NEXT: vmov.f32 s9, s4 -; CHECKBE-NEXT: vmov.f32 s10, s6 -; 
CHECKBE-NEXT: vmov.f32 s11, s2 -; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: vldrb.u16 q1, [r0] +; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -49,32 +25,12 @@ define <8 x i8> @inserti8_last(ptr %p) { ; CHECKLE-LABEL: inserti8_last: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q1, [r0] -; CHECKLE-NEXT: ldrb r1, [r0, #8] -; CHECKLE-NEXT: vmovx.f16 s0, s4 -; CHECKLE-NEXT: vmovx.f16 s1, s5 -; CHECKLE-NEXT: vmovx.f16 s2, s6 -; CHECKLE-NEXT: vins.f16 s0, s5 -; CHECKLE-NEXT: vins.f16 s1, s6 -; CHECKLE-NEXT: vins.f16 s2, s7 -; CHECKLE-NEXT: vmov.u16 r0, q1[7] -; CHECKLE-NEXT: vmov.16 q0[6], r0 -; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: vldrb.u16 q0, [r0, #1] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_last: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q0, [r0] -; CHECKBE-NEXT: ldrb r1, [r0, #8] -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vmovx.f16 s5, s1 -; CHECKBE-NEXT: vmovx.f16 s6, s2 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 s5, s2 -; CHECKBE-NEXT: vins.f16 s6, s3 -; CHECKBE-NEXT: vmov.u16 r0, q0[7] -; CHECKBE-NEXT: vmov.16 q1[6], r0 -; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vldrb.u16 q1, [r0, #1] ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 8 @@ -88,37 +44,13 @@ define <8 x i16> @inserti8_first_sext(ptr %p) { ; CHECKLE-LABEL: inserti8_first_sext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.s16 q1, [r0, #1] -; CHECKLE-NEXT: ldrsb.w r1, [r0] -; CHECKLE-NEXT: vmovx.f16 s10, s5 -; CHECKLE-NEXT: vmovx.f16 s8, s4 -; CHECKLE-NEXT: vins.f16 s10, s6 -; CHECKLE-NEXT: vmovx.f16 s6, s6 -; CHECKLE-NEXT: vmov.16 q0[0], r1 -; CHECKLE-NEXT: vins.f16 s8, s5 -; CHECKLE-NEXT: vins.f16 s6, s7 -; CHECKLE-NEXT: vmov.f32 s1, s8 -; CHECKLE-NEXT: vmov.f32 s2, s10 -; CHECKLE-NEXT: vins.f16 s0, s4 -; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: vldrb.s16 q0, [r0] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_first_sext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.s16 q0, [r0, #1] -; CHECKBE-NEXT: ldrsb.w r1, [r0] -; CHECKBE-NEXT: vmovx.f16 s6, s1 -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vins.f16 s6, s2 -; CHECKBE-NEXT: vmovx.f16 s2, s2 -; CHECKBE-NEXT: vmov.16 q2[0], r1 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 s2, s3 -; CHECKBE-NEXT: vins.f16 s8, s0 -; CHECKBE-NEXT: vmov.f32 s9, s4 -; CHECKBE-NEXT: vmov.f32 s10, s6 -; CHECKBE-NEXT: vmov.f32 s11, s2 -; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: vldrb.s16 q1, [r0] +; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -133,32 +65,12 @@ define <8 x i16> @inserti8_last_sext(ptr %p) { ; CHECKLE-LABEL: inserti8_last_sext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.s16 q1, [r0] -; CHECKLE-NEXT: ldrsb.w r1, [r0, #8] -; CHECKLE-NEXT: vmovx.f16 s0, s4 -; CHECKLE-NEXT: vmovx.f16 s1, s5 -; CHECKLE-NEXT: vmovx.f16 s2, s6 -; CHECKLE-NEXT: vins.f16 s0, s5 -; CHECKLE-NEXT: vins.f16 s1, s6 -; CHECKLE-NEXT: vins.f16 s2, s7 -; CHECKLE-NEXT: vmov.u16 r0, q1[7] -; CHECKLE-NEXT: vmov.16 q0[6], r0 -; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: vldrb.s16 q0, [r0, #1] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_last_sext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.s16 q0, [r0] -; CHECKBE-NEXT: ldrsb.w r1, [r0, #8] -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vmovx.f16 s5, s1 -; CHECKBE-NEXT: vmovx.f16 s6, s2 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 
s5, s2 -; CHECKBE-NEXT: vins.f16 s6, s3 -; CHECKBE-NEXT: vmov.u16 r0, q0[7] -; CHECKBE-NEXT: vmov.16 q1[6], r0 -; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vldrb.s16 q1, [r0, #1] ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 8 @@ -174,37 +86,13 @@ define <8 x i16> @inserti8_first_zext(ptr %p) { ; CHECKLE-LABEL: inserti8_first_zext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1] -; CHECKLE-NEXT: ldrb r1, [r0] -; CHECKLE-NEXT: vmovx.f16 s10, s5 -; CHECKLE-NEXT: vmovx.f16 s8, s4 -; CHECKLE-NEXT: vins.f16 s10, s6 -; CHECKLE-NEXT: vmovx.f16 s6, s6 -; CHECKLE-NEXT: vmov.16 q0[0], r1 -; CHECKLE-NEXT: vins.f16 s8, s5 -; CHECKLE-NEXT: vins.f16 s6, s7 -; CHECKLE-NEXT: vmov.f32 s1, s8 -; CHECKLE-NEXT: vmov.f32 s2, s10 -; CHECKLE-NEXT: vins.f16 s0, s4 -; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: vldrb.u16 q0, [r0] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_first_zext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1] -; CHECKBE-NEXT: ldrb r1, [r0] -; CHECKBE-NEXT: vmovx.f16 s6, s1 -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vins.f16 s6, s2 -; CHECKBE-NEXT: vmovx.f16 s2, s2 -; CHECKBE-NEXT: vmov.16 q2[0], r1 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 s2, s3 -; CHECKBE-NEXT: vins.f16 s8, s0 -; CHECKBE-NEXT: vmov.f32 s9, s4 -; CHECKBE-NEXT: vmov.f32 s10, s6 -; CHECKBE-NEXT: vmov.f32 s11, s2 -; CHECKBE-NEXT: vrev64.16 q0, q2 +; CHECKBE-NEXT: vldrb.u16 q1, [r0] +; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -219,32 +107,12 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { ; CHECKLE-LABEL: inserti8_last_zext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q1, [r0] -; CHECKLE-NEXT: ldrb r1, [r0, #8] -; CHECKLE-NEXT: vmovx.f16 s0, s4 -; CHECKLE-NEXT: vmovx.f16 s1, s5 -; CHECKLE-NEXT: vmovx.f16 s2, s6 -; CHECKLE-NEXT: vins.f16 s0, s5 -; CHECKLE-NEXT: vins.f16 s1, s6 -; CHECKLE-NEXT: vins.f16 s2, s7 -; CHECKLE-NEXT: vmov.u16 r0, q1[7] -; CHECKLE-NEXT: vmov.16 q0[6], r0 -; CHECKLE-NEXT: vmov.16 q0[7], r1 +; CHECKLE-NEXT: vldrb.u16 q0, [r0, #1] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_last_zext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q0, [r0] -; CHECKBE-NEXT: ldrb r1, [r0, #8] -; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vmovx.f16 s5, s1 -; CHECKBE-NEXT: vmovx.f16 s6, s2 -; CHECKBE-NEXT: vins.f16 s4, s1 -; CHECKBE-NEXT: vins.f16 s5, s2 -; CHECKBE-NEXT: vins.f16 s6, s3 -; CHECKBE-NEXT: vmov.u16 r0, q0[7] -; CHECKBE-NEXT: vmov.16 q1[6], r0 -; CHECKBE-NEXT: vmov.16 q1[7], r1 +; CHECKBE-NEXT: vldrb.u16 q1, [r0, #1] ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 8 @@ -260,14 +128,9 @@ define <8 x i32> @inserti32_first(ptr %p) { ; CHECKLE-LABEL: inserti32_first: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4] ; CHECKLE-NEXT: vldrw.u32 q2, [r0, #20] -; CHECKLE-NEXT: ldr r1, [r0] -; CHECKLE-NEXT: vmov.f32 s1, s4 -; CHECKLE-NEXT: vmov.f32 s2, s5 -; CHECKLE-NEXT: vmov.f32 s3, s6 -; CHECKLE-NEXT: vmov.f32 s4, s7 -; CHECKLE-NEXT: vmov.32 q0[0], r1 +; CHECKLE-NEXT: vldr s4, [r0, #16] +; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: vmov.f32 s5, s8 ; CHECKLE-NEXT: vmov.f32 s6, s9 ; CHECKLE-NEXT: vmov.f32 s7, s10 @@ -275,19 +138,14 @@ ; ; CHECKBE-LABEL: inserti32_first: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q0, [r0, #20] -; CHECKBE-NEXT: vldrw.u32 q2, [r0, #4] -; CHECKBE-NEXT: ldr r1, [r0] -; CHECKBE-NEXT: vmov.f32 s12, s11 -; 
CHECKBE-NEXT: vmov.f32 s13, s0 -; CHECKBE-NEXT: vmov.f32 s14, s1 -; CHECKBE-NEXT: vmov.f32 s15, s2 -; CHECKBE-NEXT: vrev64.32 q1, q3 -; CHECKBE-NEXT: vmov.f32 s13, s8 -; CHECKBE-NEXT: vmov.f32 s14, s9 -; CHECKBE-NEXT: vmov.f32 s15, s10 -; CHECKBE-NEXT: vmov.32 q3[0], r1 -; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: vldrw.u32 q3, [r0, #20] +; CHECKBE-NEXT: vldrb.u8 q1, [r0] +; CHECKBE-NEXT: vldr s8, [r0, #16] +; CHECKBE-NEXT: vmov.f32 s9, s12 +; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: vmov.f32 s10, s13 +; CHECKBE-NEXT: vmov.f32 s11, s14 +; CHECKBE-NEXT: vrev64.32 q1, q2 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 4 %l1 = load <8 x i32>, ptr %q @@ -300,34 +158,24 @@ define <8 x i32> @inserti32_last(ptr %p) { ; CHECKLE-LABEL: inserti32_last: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q2, [r0, #16] -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: ldr r1, [r0, #32] -; CHECKLE-NEXT: vmov.f32 s0, s1 -; CHECKLE-NEXT: vmov.f32 s1, s2 -; CHECKLE-NEXT: vmov.f32 s2, s3 -; CHECKLE-NEXT: vmov.f32 s3, s8 -; CHECKLE-NEXT: vmov.f32 s4, s9 -; CHECKLE-NEXT: vmov.f32 s5, s10 -; CHECKLE-NEXT: vmov.f32 s6, s11 -; CHECKLE-NEXT: vmov.32 q1[3], r1 +; CHECKLE-NEXT: vldrw.u32 q2, [r0] +; CHECKLE-NEXT: vldr s3, [r0, #16] +; CHECKLE-NEXT: vldrw.u32 q1, [r0, #20] +; CHECKLE-NEXT: vmov.f32 s0, s9 +; CHECKLE-NEXT: vmov.f32 s1, s10 +; CHECKLE-NEXT: vmov.f32 s2, s11 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti32_last: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q0, [r0] -; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16] -; CHECKBE-NEXT: ldr r1, [r0, #32] -; CHECKBE-NEXT: vmov.f32 s8, s1 -; CHECKBE-NEXT: vmov.f32 s9, s2 -; CHECKBE-NEXT: vmov.f32 s10, s3 -; CHECKBE-NEXT: vmov.f32 s11, s4 +; CHECKBE-NEXT: vldrw.u32 q3, [r0] +; CHECKBE-NEXT: vldrb.u8 q0, [r0, #20] +; CHECKBE-NEXT: vldr s11, [r0, #16] +; CHECKBE-NEXT: vmov.f32 s8, s13 +; CHECKBE-NEXT: vrev64.8 q1, q0 +; CHECKBE-NEXT: vmov.f32 s9, s14 +; CHECKBE-NEXT: vmov.f32 s10, s15 ; CHECKBE-NEXT: vrev64.32 q0, q2 -; CHECKBE-NEXT: vmov.f32 s8, s5 -; CHECKBE-NEXT: vmov.f32 s9, s6 -; CHECKBE-NEXT: vmov.f32 s10, s7 -; CHECKBE-NEXT: vmov.32 q2[3], r1 -; CHECKBE-NEXT: vrev64.32 q1, q2 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 32 %l1 = load <8 x i32>, ptr %p @@ -340,37 +188,29 @@ define <8 x i32> @inserti32_first_multiuse(ptr %p) { ; CHECKLE-LABEL: inserti32_first_multiuse: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q1, [r0, #20] -; CHECKLE-NEXT: vldrw.u32 q0, [r0, #4] -; CHECKLE-NEXT: ldr r1, [r0] -; CHECKLE-NEXT: vmov.f32 s8, s3 -; CHECKLE-NEXT: vmov.f32 s9, s4 -; CHECKLE-NEXT: vmov.f32 s10, s5 -; CHECKLE-NEXT: vmov.f32 s11, s6 -; CHECKLE-NEXT: vadd.i32 q1, q1, q2 -; CHECKLE-NEXT: vmov.f32 s9, s0 -; CHECKLE-NEXT: vmov.f32 s10, s1 -; CHECKLE-NEXT: vmov.f32 s11, s2 -; CHECKLE-NEXT: vmov.32 q2[0], r1 -; CHECKLE-NEXT: vadd.i32 q0, q0, q2 +; CHECKLE-NEXT: vldrw.u32 q0, [r0, #20] +; CHECKLE-NEXT: vldrw.u32 q2, [r0, #4] +; CHECKLE-NEXT: vmov.f32 s4, s11 +; CHECKLE-NEXT: vmov.f32 s5, s0 +; CHECKLE-NEXT: vmov.f32 s6, s1 +; CHECKLE-NEXT: vmov.f32 s7, s2 +; CHECKLE-NEXT: vadd.i32 q1, q0, q1 +; CHECKLE-NEXT: vldrw.u32 q0, [r0] +; CHECKLE-NEXT: vadd.i32 q0, q2, q0 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti32_first_multiuse: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q1, [r0, #20] -; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4] -; CHECKBE-NEXT: ldr r1, [r0] -; CHECKBE-NEXT: vmov.f32 s8, s3 -; CHECKBE-NEXT: vmov.f32 s9, s4 -; CHECKBE-NEXT: vmov.f32 s10, s5 -; CHECKBE-NEXT: vmov.f32 s11, s6 -; CHECKBE-NEXT: vadd.i32 q2, q1, q2 
-; CHECKBE-NEXT: vrev64.32 q1, q2 -; CHECKBE-NEXT: vmov.f32 s9, s0 -; CHECKBE-NEXT: vmov.f32 s10, s1 -; CHECKBE-NEXT: vmov.f32 s11, s2 -; CHECKBE-NEXT: vmov.32 q2[0], r1 -; CHECKBE-NEXT: vadd.i32 q2, q0, q2 +; CHECKBE-NEXT: vldrw.u32 q0, [r0, #20] +; CHECKBE-NEXT: vldrw.u32 q2, [r0, #4] +; CHECKBE-NEXT: vmov.f32 s4, s11 +; CHECKBE-NEXT: vmov.f32 s5, s0 +; CHECKBE-NEXT: vmov.f32 s6, s1 +; CHECKBE-NEXT: vmov.f32 s7, s2 +; CHECKBE-NEXT: vadd.i32 q0, q0, q1 +; CHECKBE-NEXT: vrev64.32 q1, q0 +; CHECKBE-NEXT: vldrw.u32 q0, [r0] +; CHECKBE-NEXT: vadd.i32 q2, q2, q0 ; CHECKBE-NEXT: vrev64.32 q0, q2 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 4 @@ -387,16 +227,12 @@ ; CHECKLE: @ %bb.0: ; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: vldrw.u32 q1, [r0, #16] -; CHECKLE-NEXT: ldr r1, [r0, #32] ; CHECKLE-NEXT: vmov.f32 s8, s1 ; CHECKLE-NEXT: vmov.f32 s9, s2 ; CHECKLE-NEXT: vmov.f32 s10, s3 ; CHECKLE-NEXT: vmov.f32 s11, s4 ; CHECKLE-NEXT: vadd.i32 q0, q0, q2 -; CHECKLE-NEXT: vmov.f32 s8, s5 -; CHECKLE-NEXT: vmov.f32 s9, s6 -; CHECKLE-NEXT: vmov.f32 s10, s7 -; CHECKLE-NEXT: vmov.32 q2[3], r1 +; CHECKLE-NEXT: vldrw.u32 q2, [r0, #20] ; CHECKLE-NEXT: vadd.i32 q1, q1, q2 ; CHECKLE-NEXT: bx lr ; @@ -404,17 +240,13 @@ ; CHECKBE: @ %bb.0: ; CHECKBE-NEXT: vldrw.u32 q0, [r0] ; CHECKBE-NEXT: vldrw.u32 q1, [r0, #16] -; CHECKBE-NEXT: ldr r1, [r0, #32] ; CHECKBE-NEXT: vmov.f32 s8, s1 ; CHECKBE-NEXT: vmov.f32 s9, s2 ; CHECKBE-NEXT: vmov.f32 s10, s3 ; CHECKBE-NEXT: vmov.f32 s11, s4 ; CHECKBE-NEXT: vadd.i32 q2, q0, q2 ; CHECKBE-NEXT: vrev64.32 q0, q2 -; CHECKBE-NEXT: vmov.f32 s8, s5 -; CHECKBE-NEXT: vmov.f32 s9, s6 -; CHECKBE-NEXT: vmov.f32 s10, s7 -; CHECKBE-NEXT: vmov.32 q2[3], r1 +; CHECKBE-NEXT: vldrw.u32 q2, [r0, #20] ; CHECKBE-NEXT: vadd.i32 q2, q1, q2 ; CHECKBE-NEXT: vrev64.32 q1, q2 ; CHECKBE-NEXT: bx lr @@ -430,21 +262,13 @@ define <4 x float> @insertf32_first(ptr %p) { ; CHECKLE-LABEL: insertf32_first: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q1, [r0, #4] -; CHECKLE-NEXT: vldr s0, [r0] -; CHECKLE-NEXT: vmov.f32 s1, s4 -; CHECKLE-NEXT: vmov.f32 s2, s5 -; CHECKLE-NEXT: vmov.f32 s3, s6 +; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: insertf32_first: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q0, [r0, #4] -; CHECKBE-NEXT: vldr s4, [r0] -; CHECKBE-NEXT: vmov.f32 s5, s0 -; CHECKBE-NEXT: vmov.f32 s6, s1 -; CHECKBE-NEXT: vmov.f32 s7, s2 -; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: vldrb.u8 q1, [r0] +; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 4 %l1 = load <4 x float>, ptr %q @@ -457,21 +281,13 @@ define <4 x float> @insertf32_last(ptr %p) { ; CHECKLE-LABEL: insertf32_last: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q1, [r0] -; CHECKLE-NEXT: vldr s3, [r0, #16] -; CHECKLE-NEXT: vmov.f32 s0, s5 -; CHECKLE-NEXT: vmov.f32 s1, s6 -; CHECKLE-NEXT: vmov.f32 s2, s7 +; CHECKLE-NEXT: vldrw.u32 q0, [r0, #4] ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: insertf32_last: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q0, [r0] -; CHECKBE-NEXT: vldr s7, [r0, #16] -; CHECKBE-NEXT: vmov.f32 s4, s1 -; CHECKBE-NEXT: vmov.f32 s5, s2 -; CHECKBE-NEXT: vmov.f32 s6, s3 -; CHECKBE-NEXT: vrev64.32 q0, q1 +; CHECKBE-NEXT: vldrb.u8 q1, [r0, #4] +; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 16 %l1 = load <4 x float>, ptr %p