Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -66,6 +66,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Local.h" #include #include @@ -118,6 +119,14 @@ /// replacements are also performed. bool tryReplaceExtracts(ArrayRef Extracts, ArrayRef Shuffles); + + /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them + /// to binop(shuffle(x), shuffle(y)) to allow the formation of an + /// interleaving load. Any newly created shuffles that operate on \p LI will + /// be added to \p Shuffles. + bool tryReplaceBinOpShuffles(ArrayRef BinOpShuffles, + SmallVectorImpl &Shuffles, + LoadInst *LI); }; } // end anonymous namespace. @@ -285,59 +294,80 @@ SmallVector Shuffles; SmallVector Extracts; + SmallSetVector BinOpShuffles; // Check if all users of this load are shufflevectors. If we encounter any - // users that are extractelement instructions, we save them to later check if - // they can be modifed to extract from one of the shufflevectors instead of - // the load. - for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) { - auto *Extract = dyn_cast(*UI); + // users that are extractelement instructions or binary operators, we save + // them to later check if they can be modified to extract from one of the + // shufflevectors instead of the load. 
+ for (auto *User : LI->users()) { + auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); continue; } - ShuffleVectorInst *SVI = dyn_cast(*UI); + auto *BI = dyn_cast(User); + if (BI && BI->hasOneUse()) { + if (auto SVI = dyn_cast(*BI->user_begin())) { + BinOpShuffles.insert(SVI); + continue; + } + } + ShuffleVectorInst *SVI = dyn_cast(User); if (!SVI || !isa(SVI->getOperand(1))) return false; Shuffles.push_back(SVI); } - if (Shuffles.empty()) + if (Shuffles.empty() && BinOpShuffles.empty()) return false; unsigned Factor, Index; unsigned NumLoadElements = cast(LI->getType())->getNumElements(); + auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. - if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index, - MaxFactor, NumLoadElements)) + if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, + NumLoadElements)) return false; // Holds the corresponding index for each DE-interleave shuffle. SmallVector Indices; - Indices.push_back(Index); - Type *VecTy = Shuffles[0]->getType(); + Type *VecTy = FirstSVI->getType(); // Check if other shufflevectors are also DE-interleaved of the same type // and factor as the first shufflevector. 
- for (unsigned i = 1; i < Shuffles.size(); i++) { + for (unsigned i = 0; i < Shuffles.size(); i++) { if (Shuffles[i]->getType() != VecTy) return false; - if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor, Index)) return false; Indices.push_back(Index); } + for (unsigned i = 0; i < BinOpShuffles.size(); i++) { + if (BinOpShuffles[i]->getType() != VecTy) + return false; + if (!isDeInterleaveMaskOfFactor(BinOpShuffles[i]->getShuffleMask(), Factor, + Index)) + return false; + + if (cast(BinOpShuffles[i]->getOperand(0))->getOperand(0) == LI) + Indices.push_back(Index); + if (cast(BinOpShuffles[i]->getOperand(0))->getOperand(1) == LI) + Indices.push_back(Index); + } // Try and modify users of the load that are extractelement instructions to // use the shufflevector instructions instead of the load. if (!tryReplaceExtracts(Extracts, Shuffles)) return false; + if (!tryReplaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI)) + return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); @@ -352,6 +382,34 @@ return true; } +bool InterleavedAccess::tryReplaceBinOpShuffles( + ArrayRef BinOpShuffles, + SmallVectorImpl &Shuffles, LoadInst *LI) { + for (auto *SVI : BinOpShuffles) { + BinaryOperator *BI = cast(SVI->getOperand(0)); + ArrayRef Mask = SVI->getShuffleMask(); + + auto *NewSVI1 = new ShuffleVectorInst( + BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask, + SVI->getName(), SVI); + auto *NewSVI2 = new ShuffleVectorInst( + BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask, + SVI->getName(), SVI); + Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2, + BI->getName(), SVI); + SVI->replaceAllUsesWith(NewBI); + LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI + << "\n With : " << *NewSVI1 << "\n And : " + << *NewSVI2 << "\n And : " << *NewBI << "\n"); + RecursivelyDeleteTriviallyDeadInstructions(SVI); + if (NewSVI1->getOperand(0) == LI) + 
Shuffles.push_back(NewSVI1); + if (NewSVI2->getOperand(0) == LI) + Shuffles.push_back(NewSVI2); + } + return true; +} + bool InterleavedAccess::tryReplaceExtracts( ArrayRef Extracts, ArrayRef Shuffles) { Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -7,13 +7,10 @@ ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q0, [x0], #32 -; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s -; CHECK-NEXT: uzp1 v2.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-NEXT: str q0, [x1, x8] +; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32 +; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s +; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s +; CHECK-NEXT: str q2, [x1, x8] ; CHECK-NEXT: add x8, x8, #16 // =16 ; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 ; CHECK-NEXT: b.ne .LBB0_1 @@ -50,27 +47,11 @@ ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q2, q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x0], #48 -; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s -; CHECK-NEXT: ext v3.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v3.16b, #12 -; CHECK-NEXT: ext v3.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: dup v4.4s, v0.s[1] -; CHECK-NEXT: mov v2.s[0], v1.s[2] -; CHECK-NEXT: dup v1.4s, v0.s[2] -; CHECK-NEXT: mov v0.s[2], v0.s[0] -; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8 -; CHECK-NEXT: mov v5.s[3], v4.s[3] -; CHECK-NEXT: mov v3.s[3], v1.s[3] -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v1.4s, v3.4s, v5.4s -; CHECK-NEXT: 
fadd v0.4s, v1.4s, v0.4s -; CHECK-NEXT: str q0, [x1, x8] +; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48 +; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s +; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s +; CHECK-NEXT: str q3, [x1, x8] ; CHECK-NEXT: add x8, x8, #16 // =16 ; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096 ; CHECK-NEXT: b.ne .LBB1_1 @@ -110,37 +91,15 @@ ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64 ; CHECK-NEXT: add x9, x1, x8 ; CHECK-NEXT: add x8, x8, #32 // =32 -; CHECK-NEXT: fmul v3.4s, v3.4s, v3.4s -; CHECK-NEXT: fmul v2.4s, v2.4s, v2.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v0.4s -; CHECK-NEXT: zip1 v5.4s, v2.4s, v3.4s -; CHECK-NEXT: trn2 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: zip1 v4.4s, v0.4s, v1.4s -; CHECK-NEXT: trn2 v6.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 -; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8 -; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v4.16b, v5.16b, v4.16b, #8 -; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v0.16b, v6.16b, v0.16b, #8 -; CHECK-NEXT: ext v6.16b, v7.16b, v6.16b, #8 -; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: ext v0.16b, v5.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v4.16b, v4.16b, #8 -; CHECK-NEXT: ext v4.16b, v6.16b, v6.16b, #8 -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v2.4s, v4.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192 -; CHECK-NEXT: fadd v3.4s, v0.4s, v1.4s -; CHECK-NEXT: add x0, x0, #64 // =64 -; CHECK-NEXT: st2 { v2.4s, v3.4s }, [x9] +; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s +; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s +; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s +; CHECK-NEXT: fmla v5.4s, 
v3.4s, v3.4s +; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9] ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret @@ -184,16 +143,13 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x0, x8 ; CHECK-NEXT: add x10, x1, x8 -; CHECK-NEXT: ldp q0, q1, [x9] -; CHECK-NEXT: ldp q3, q2, [x10] +; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9] +; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10] ; CHECK-NEXT: add x8, x8, #32 // =32 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192 -; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v3.4s, v0.4s -; CHECK-NEXT: uzp1 v2.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-NEXT: str q0, [x2], #16 +; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s +; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s +; CHECK-NEXT: str q4, [x2], #16 ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret Index: llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -4,94 +4,49 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) { ; CHECK-LABEL: arm_cmplx_mag_squared_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq.w .LBB0_9 +; CHECK-NEXT: beq .LBB0_8 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader ; CHECK-NEXT: cmp r2, #8 -; CHECK-NEXT: blo.w .LBB0_6 +; CHECK-NEXT: blo .LBB0_9 ; CHECK-NEXT: @ %bb.2: @ %vector.memcheck ; CHECK-NEXT: add.w r3, r0, r2, lsl #2 ; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: itt hi ; CHECK-NEXT: addhi.w r3, r1, r2, lsl #1 ; CHECK-NEXT: cmphi r3, r0 -; CHECK-NEXT: bhi .LBB0_6 +; CHECK-NEXT: bhi .LBB0_9 ; CHECK-NEXT: @ %bb.3: @ 
%vector.ph -; CHECK-NEXT: bic r5, r2, #7 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r3, r5, #8 -; CHECK-NEXT: and r8, r2, #7 -; CHECK-NEXT: add.w r12, r1, r5, lsl #1 -; CHECK-NEXT: add.w r3, r4, r3, lsr #3 -; CHECK-NEXT: mov r7, r3 -; CHECK-NEXT: add.w r3, r0, r5, lsl #2 +; CHECK-NEXT: bic r4, r2, #7 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r12, r4, #8 +; CHECK-NEXT: and r7, r2, #7 +; CHECK-NEXT: add.w r3, r3, r12, lsr #3 +; CHECK-NEXT: add.w r12, r1, r4, lsl #1 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: add.w r3, r0, r4, lsl #2 ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 -; CHECK-NEXT: mov lr, r7 +; CHECK-NEXT: vld20.16 {q0, q1}, [r0] +; CHECK-NEXT: mov lr, r5 ; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: vmul.f16 q1, q0, q0 -; CHECK-NEXT: mov r7, lr -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov.16 q0[0], r6 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov.16 q0[1], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmov.16 q0[2], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmov.16 q0[3], r4 -; CHECK-NEXT: vmul.f16 q2, q2, q2 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.16 q0[4], r4 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmov.16 q0[5], r4 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov.16 q3[0], r6 -; CHECK-NEXT: vmov.16 q0[6], r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov.16 q3[1], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov.16 q3[2], r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: vmov.16 q3[3], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov.16 q3[4], r4 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vmov.16 q3[5], r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov.16 q3[6], 
r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: vmovx.f16 s4, s11 -; CHECK-NEXT: vmov.16 q3[7], r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: vadd.f16 q0, q0, q3 -; CHECK-NEXT: vstrb.8 q0, [r1], #16 +; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! +; CHECK-NEXT: mov r5, lr +; CHECK-NEXT: vmul.f16 q2, q0, q0 +; CHECK-NEXT: vfma.f16 q2, q1, q1 +; CHECK-NEXT: vstrb.8 q2, [r1], #16 ; CHECK-NEXT: bne .LBB0_4 ; CHECK-NEXT: b .LBB0_5 ; CHECK-NEXT: .LBB0_5: @ %middle.block -; CHECK-NEXT: cmp r5, r2 -; CHECK-NEXT: mov lr, r8 -; CHECK-NEXT: bne .LBB0_7 -; CHECK-NEXT: b .LBB0_9 -; CHECK-NEXT: .LBB0_6: -; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov lr, r2 -; CHECK-NEXT: .LBB0_7: @ %while.body.preheader26 +; CHECK-NEXT: cmp r4, r2 +; CHECK-NEXT: mov lr, r7 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, r5, r7, pc} +; CHECK-NEXT: .LBB0_6: @ %while.body.preheader26 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_8: @ %while.body +; CHECK-NEXT: .LBB0_7: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldr.16 s0, [r3] ; CHECK-NEXT: vldr.16 s2, [r3, #2] @@ -100,9 +55,14 @@ ; CHECK-NEXT: vfma.f16 s0, s2, s2 ; CHECK-NEXT: vstr.16 s0, [r12] ; CHECK-NEXT: add.w r12, r12, #2 -; CHECK-NEXT: le lr, .LBB0_8 -; CHECK-NEXT: .LBB0_9: @ %while.end -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: le lr, .LBB0_7 +; CHECK-NEXT: .LBB0_8: @ %while.end +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .LBB0_9: +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov lr, r2 +; CHECK-NEXT: b .LBB0_6 entry: %cmp.not11 = icmp eq i32 %numSamples, 0 br i1 %cmp.not11, label %while.end, label %while.body.preheader @@ -195,37 +155,28 @@ ; CHECK-NEXT: cmphi r3, r0 ; CHECK-NEXT: bhi .LBB1_9 ; CHECK-NEXT: @ %bb.3: @ %vector.ph -; CHECK-NEXT: bic r5, r2, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: subs r3, r5, #4 +; CHECK-NEXT: bic r4, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r12, r4, #4 ; 
CHECK-NEXT: and r7, r2, #3 -; CHECK-NEXT: add.w r12, r1, r5, lsl #2 -; CHECK-NEXT: add.w r3, r4, r3, lsr #2 -; CHECK-NEXT: mov r4, r3 -; CHECK-NEXT: add.w r3, r0, r5, lsl #3 +; CHECK-NEXT: add.w r3, r3, r12, lsr #2 +; CHECK-NEXT: add.w r12, r1, r4, lsl #2 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: add.w r3, r0, r4, lsl #3 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0], #32 -; CHECK-NEXT: mov lr, r4 -; CHECK-NEXT: vmul.f32 q1, q1, q1 -; CHECK-NEXT: vmul.f32 q0, q0, q0 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: mov lr, r5 ; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vadd.f32 q0, q3, q2 -; CHECK-NEXT: vstrb.8 q0, [r1], #16 +; CHECK-NEXT: vld21.32 {q0, q1}, [r0]! +; CHECK-NEXT: mov r5, lr +; CHECK-NEXT: vmul.f32 q2, q0, q0 +; CHECK-NEXT: vfma.f32 q2, q1, q1 +; CHECK-NEXT: vstrb.8 q2, [r1], #16 ; CHECK-NEXT: bne .LBB1_4 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_5: @ %middle.block -; CHECK-NEXT: cmp r5, r2 +; CHECK-NEXT: cmp r4, r2 ; CHECK-NEXT: mov lr, r7 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r7, pc}