Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <utility>

@@ -118,6 +119,14 @@
   /// replacements are also performed.
   bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                           ArrayRef<ShuffleVectorInst *> Shuffles);
+
+  /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
+  /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
+  /// interleaving load. Any newly created shuffles that operate on \p LI will
+  /// be added to \p Shuffles.
+  bool tryReplaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+                               SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
+                               LoadInst *LI);
 };

 } // end anonymous namespace.
@@ -283,61 +292,85 @@
   if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
     return false;

+  // Check if all users of this load are shufflevectors. If we encounter any
+  // users that are extractelement instructions or binary operators, we save
+  // them to later check if they can be modified to extract from one of the
+  // shufflevectors instead of the load.
+
   SmallVector<ShuffleVectorInst *, 4> Shuffles;
   SmallVector<ExtractElementInst *, 4> Extracts;
+  // BinOpShuffles need to be handled a single time in case both operands of the
+  // binop are the same load.
+  SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

-  // Check if all users of this load are shufflevectors. If we encounter any
-  // users that are extractelement instructions, we save them to later check if
-  // they can be modifed to extract from one of the shufflevectors instead of
-  // the load.
-  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
-    auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+  for (auto *User : LI->users()) {
+    auto *Extract = dyn_cast<ExtractElementInst>(User);
     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
       Extracts.push_back(Extract);
       continue;
     }
-    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
+    auto *BI = dyn_cast<BinaryOperator>(User);
+    if (BI && BI->hasOneUse()) {
+      if (auto *SVI = dyn_cast<ShuffleVectorInst>(*BI->user_begin())) {
+        BinOpShuffles.insert(SVI);
+        continue;
+      }
+    }
+    auto *SVI = dyn_cast<ShuffleVectorInst>(User);
     if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
       return false;

     Shuffles.push_back(SVI);
   }

-  if (Shuffles.empty())
+  if (Shuffles.empty() && BinOpShuffles.empty())
     return false;

   unsigned Factor, Index;

   unsigned NumLoadElements =
       cast<FixedVectorType>(LI->getType())->getNumElements();
+  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
   // Check if the first shufflevector is DE-interleave shuffle.
-  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
-                          MaxFactor, NumLoadElements))
+  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
+                          NumLoadElements))
     return false;

   // Holds the corresponding index for each DE-interleave shuffle.
   SmallVector<unsigned, 4> Indices;
-  Indices.push_back(Index);

-  Type *VecTy = Shuffles[0]->getType();
+  Type *VecTy = FirstSVI->getType();

   // Check if other shufflevectors are also DE-interleaved of the same type
   // and factor as the first shufflevector.
-  for (unsigned i = 1; i < Shuffles.size(); i++) {
-    if (Shuffles[i]->getType() != VecTy)
+  for (auto *Shuffle : Shuffles) {
+    if (Shuffle->getType() != VecTy)
       return false;
-
-    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+    if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
                                     Index))
       return false;

     Indices.push_back(Index);
   }
+  for (auto *Shuffle : BinOpShuffles) {
+    if (Shuffle->getType() != VecTy)
+      return false;
+    if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
+                                    Index))
+      return false;
+
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+      Indices.push_back(Index);
+    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+      Indices.push_back(Index);
+  }

   // Try and modify users of the load that are extractelement instructions to
   // use the shufflevector instructions instead of the load.
   if (!tryReplaceExtracts(Extracts, Shuffles))
     return false;

+  if (!tryReplaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI))
+    return false;
+
   LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
@@ -352,6 +385,34 @@
   return true;
 }

+bool InterleavedAccess::tryReplaceBinOpShuffles(
+    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+  for (auto *SVI : BinOpShuffles) {
+    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
+    ArrayRef<int> Mask = SVI->getShuffleMask();
+
+    auto *NewSVI1 = new ShuffleVectorInst(
+        BI->getOperand(0), UndefValue::get(BI->getOperand(0)->getType()), Mask,
+        SVI->getName(), SVI);
+    auto *NewSVI2 = new ShuffleVectorInst(
+        BI->getOperand(1), UndefValue::get(BI->getOperand(1)->getType()), Mask,
+        SVI->getName(), SVI);
+    Value *NewBI = BinaryOperator::Create(BI->getOpcode(), NewSVI1, NewSVI2,
+                                          BI->getName(), SVI);
+    SVI->replaceAllUsesWith(NewBI);
+    LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
+                      << "\n  With    : " << *NewSVI1 << "\n    And   : "
+                      << *NewSVI2 << "\n    And   : " << *NewBI << "\n");
+    RecursivelyDeleteTriviallyDeadInstructions(SVI);
+    if (NewSVI1->getOperand(0) == LI)
+      Shuffles.push_back(NewSVI1);
+    if (NewSVI2->getOperand(0) == LI)
+      Shuffles.push_back(NewSVI2);
+  }
+  return true;
+}
+
 bool InterleavedAccess::tryReplaceExtracts(
     ArrayRef<ExtractElementInst *> Extracts,
     ArrayRef<ShuffleVectorInst *> Shuffles) {
@@ -421,7 +482,7 @@
   if (!SI->isSimple())
     return false;

-  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+  auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
   if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
     return false;

@@ -461,10 +522,10 @@
   bool Changed = false;
   for (auto &I : instructions(F)) {
-    if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+    if (auto *LI = dyn_cast<LoadInst>(&I))
      Changed |= lowerInterleavedLoad(LI, DeadInsts);

-    if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+    if (auto *SI = dyn_cast<StoreInst>(&I))
      Changed |= lowerInterleavedStore(SI, DeadInsts);
   }
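To make the rewrite concrete before the test changes: for the factor-2 case, tryReplaceBinOpShuffles turns a deinterleaving shuffle of a binop on the wide load into a binop of two deinterleaving shuffles of the load itself, which the existing matching can then fold into an interleaved load. A minimal IR sketch (value names are illustrative, not taken from the patch):

  ; Before: the load has a non-shufflevector user, so no ld2 group is found.
  %wide.vec = load <8 x float>, <8 x float>* %ptr, align 4
  %mul = fmul fast <8 x float> %wide.vec, %wide.vec
  %even = shufflevector <8 x float> %mul, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

  ; After: both new shuffles operate directly on the load and are added to
  ; the Shuffles list; the binop is recreated on the narrowed vectors.
  %even1 = shufflevector <8 x float> %wide.vec, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even2 = shufflevector <8 x float> %wide.vec, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %even.new = fmul <4 x float> %even1, %even2

Note that BinaryOperator::Create does not carry over the original fast-math flags, which is why the recreated fmul instructions in the binopshuffles.ll checks below appear without the fast flag.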
Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -7,13 +7,10 @@
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q0, [x0], #32
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    uzp1 v2.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    str q0, [x1, x8]
+; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
+; CHECK-NEXT:    str q2, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16 // =16
 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
 ; CHECK-NEXT:    b.ne .LBB0_1
@@ -50,27 +47,11 @@
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q0, [x0, #16]
-; CHECK-NEXT:    ldr q1, [x0], #48
-; CHECK-NEXT:    fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    ext v3.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    ext v5.16b, v1.16b, v3.16b, #12
-; CHECK-NEXT:    ext v3.16b, v3.16b, v2.16b, #4
-; CHECK-NEXT:    dup v4.4s, v0.s[1]
-; CHECK-NEXT:    mov v2.s[0], v1.s[2]
-; CHECK-NEXT:    dup v1.4s, v0.s[2]
-; CHECK-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-NEXT:    ext v5.16b, v5.16b, v5.16b, #12
-; CHECK-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    ext v0.16b, v0.16b, v2.16b, #8
-; CHECK-NEXT:    mov v5.s[3], v4.s[3]
-; CHECK-NEXT:    mov v3.s[3], v1.s[3]
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    fadd v1.4s, v3.4s, v5.4s
-; CHECK-NEXT:    fadd v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    str q0, [x1, x8]
+; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
+; CHECK-NEXT:    str q3, [x1, x8]
 ; CHECK-NEXT:    add x8, x8, #16 // =16
 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
 ; CHECK-NEXT:    b.ne .LBB1_1
@@ -110,37 +91,15 @@
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
 ; CHECK-NEXT:    add x9, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32 // =32
-; CHECK-NEXT:    fmul v3.4s, v3.4s, v3.4s
-; CHECK-NEXT:    fmul v2.4s, v2.4s, v2.4s
-; CHECK-NEXT:    fmul v1.4s, v1.4s, v1.4s
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    zip1 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT:    trn2 v7.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip1 v4.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn2 v6.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT:    ext v7.16b, v2.16b, v7.16b, #8
-; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v4.16b, v5.16b, v4.16b, #8
-; CHECK-NEXT:    zip2 v5.4s, v2.4s, v3.4s
-; CHECK-NEXT:    ext v0.16b, v6.16b, v0.16b, #8
-; CHECK-NEXT:    ext v6.16b, v7.16b, v6.16b, #8
-; CHECK-NEXT:    mov v2.s[3], v3.s[2]
-; CHECK-NEXT:    ext v0.16b, v5.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT:    ext v4.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT:    ext v1.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    fadd v2.4s, v4.4s, v3.4s
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT:    fadd v3.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add x0, x0, #64 // =64
-; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x9]
+; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
+; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
+; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
@@ -184,16 +143,13 @@
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x10, x1, x8
-; CHECK-NEXT:    ldp q0, q1, [x9]
-; CHECK-NEXT:    ldp q3, q2, [x10]
+; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
 ; CHECK-NEXT:    add x8, x8, #32 // =32
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    fmul v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    uzp1 v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    str q0, [x2], #16
+; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
+; CHECK-NEXT:    str q4, [x2], #16
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
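At the IR level, the ld2/ld3/ld4 sequences in the updated checks above come from the target's lowerInterleavedLoad hook, which this pass invokes once a full de-interleave group has been matched; it replaces the wide load and its deinterleaving shuffles with a NEON structure-load intrinsic. A sketch of the factor-2 shape (value names are illustrative; the same pattern appears in the binopshuffles.ll checks further below):

  %ptr = bitcast <8 x float>* %pSrc to <4 x float>*
  %ldN = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* %ptr)
  %evens = extractvalue { <4 x float>, <4 x float> } %ldN, 0
  %odds = extractvalue { <4 x float>, <4 x float> } %ldN, 1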
Index: llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -4,94 +4,49 @@
 define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
 ; CHECK-LABEL: arm_cmplx_mag_squared_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq.w .LBB0_9
+; CHECK-NEXT:    beq .LBB0_8
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    cmp r2, #8
-; CHECK-NEXT:    blo.w .LBB0_6
+; CHECK-NEXT:    blo .LBB0_9
 ; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
 ; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
 ; CHECK-NEXT:    cmp r3, r1
 ; CHECK-NEXT:    itt hi
 ; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
 ; CHECK-NEXT:    cmphi r3, r0
-; CHECK-NEXT:    bhi .LBB0_6
+; CHECK-NEXT:    bhi .LBB0_9
 ; CHECK-NEXT:  @ %bb.3: @ %vector.ph
-; CHECK-NEXT:    bic r5, r2, #7
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    sub.w r3, r5, #8
-; CHECK-NEXT:    and r8, r2, #7
-; CHECK-NEXT:    add.w r12, r1, r5, lsl #1
-; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
-; CHECK-NEXT:    mov r7, r3
-; CHECK-NEXT:    add.w r3, r0, r5, lsl #2
+; CHECK-NEXT:    bic r4, r2, #7
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    sub.w r12, r4, #8
+; CHECK-NEXT:    and r7, r2, #7
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
+; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
-; CHECK-NEXT:    mov lr, r7
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    mov lr, r5
 ; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    vmul.f16 q1, q0, q0
-; CHECK-NEXT:    mov r7, lr
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov r6, s0
-; CHECK-NEXT:    vmov.16 q0[0], r6
-; CHECK-NEXT:    vmov r6, s4
-; CHECK-NEXT:    vmov.16 q0[1], r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vmov.16 q0[2], r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT:    vmov.16 q0[3], r4
-; CHECK-NEXT:    vmul.f16 q2, q2, q2
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vmov.16 q0[4], r4
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vmov.16 q0[5], r4
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov.16 q3[0], r6
-; CHECK-NEXT:    vmov.16 q0[6], r4
-; CHECK-NEXT:    vmov r4, s5
-; CHECK-NEXT:    vmov.16 q3[1], r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov.16 q3[2], r4
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    vmov.16 q3[3], r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmov.16 q3[4], r4
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    vmov.16 q3[5], r4
-; CHECK-NEXT:    vmov r4, s10
-; CHECK-NEXT:    vmov.16 q3[6], r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    vmovx.f16 s4, s11
-; CHECK-NEXT:    vmov.16 q3[7], r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vadd.f16 q0, q0, q3
-; CHECK-NEXT:    vstrb.8 q0, [r1], #16
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
+; CHECK-NEXT:    mov r5, lr
+; CHECK-NEXT:    vmul.f16 q2, q0, q0
+; CHECK-NEXT:    vfma.f16 q2, q1, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
 ; CHECK-NEXT:    bne .LBB0_4
 ; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:  .LBB0_5: @ %middle.block
-; CHECK-NEXT:    cmp r5, r2
-; CHECK-NEXT:    mov lr, r8
-; CHECK-NEXT:    bne .LBB0_7
-; CHECK-NEXT:    b .LBB0_9
-; CHECK-NEXT:  .LBB0_6:
-; CHECK-NEXT:    mov r3, r0
-; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    mov lr, r2
-; CHECK-NEXT:  .LBB0_7: @ %while.body.preheader26
+; CHECK-NEXT:    cmp r4, r2
+; CHECK-NEXT:    mov lr, r7
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    popeq {r4, r5, r7, pc}
+; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:  .LBB0_8: @ %while.body
+; CHECK-NEXT:  .LBB0_7: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldr.16 s0, [r3]
 ; CHECK-NEXT:    vldr.16 s2, [r3, #2]
@@ -100,9 +55,14 @@
 ; CHECK-NEXT:    vfma.f16 s0, s2, s2
 ; CHECK-NEXT:    vstr.16 s0, [r12]
 ; CHECK-NEXT:    add.w r12, r12, #2
-; CHECK-NEXT:    le lr, .LBB0_8
-; CHECK-NEXT:  .LBB0_9: @ %while.end
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT:    le lr, .LBB0_7
+; CHECK-NEXT:  .LBB0_8: @ %while.end
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:  .LBB0_9:
+; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    mov lr, r2
+; CHECK-NEXT:    b .LBB0_6
 entry:
   %cmp.not11 = icmp eq i32 %numSamples, 0
   br i1 %cmp.not11, label %while.end, label %while.body.preheader
@@ -195,37 +155,28 @@
 ; CHECK-NEXT:    cmphi r3, r0
 ; CHECK-NEXT:    bhi .LBB1_9
 ; CHECK-NEXT:  @ %bb.3: @ %vector.ph
-; CHECK-NEXT:    bic r5, r2, #3
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    subs r3, r5, #4
+; CHECK-NEXT:    bic r4, r2, #3
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    sub.w r12, r4, #4
 ; CHECK-NEXT:    and r7, r2, #3
-; CHECK-NEXT:    add.w r12, r1, r5, lsl #2
-; CHECK-NEXT:    add.w r3, r4, r3, lsr #2
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:    add.w r3, r0, r5, lsl #3
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0], #32
-; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    vmul.f32 q1, q1, q1
-; CHECK-NEXT:    vmul.f32 q0, q0, q0
-; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    mov lr, r5
 ; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    mov r4, lr
-; CHECK-NEXT:    vmov.f32 s12, s5
-; CHECK-NEXT:    vmov.f32 s9, s6
-; CHECK-NEXT:    vmov.f32 s13, s7
-; CHECK-NEXT:    vmov.f32 s10, s0
-; CHECK-NEXT:    vmov.f32 s14, s1
-; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vmov.f32 s15, s3
-; CHECK-NEXT:    vadd.f32 q0, q3, q2
-; CHECK-NEXT:    vstrb.8 q0, [r1], #16
+; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
+; CHECK-NEXT:    mov r5, lr
+; CHECK-NEXT:    vmul.f32 q2, q0, q0
+; CHECK-NEXT:    vfma.f32 q2, q1, q1
+; CHECK-NEXT:    vstrb.8 q2, [r1], #16
 ; CHECK-NEXT:    bne .LBB1_4
 ; CHECK-NEXT:    b .LBB1_5
 ; CHECK-NEXT:  .LBB1_5: @ %middle.block
-; CHECK-NEXT:    cmp r5, r2
+; CHECK-NEXT:    cmp r4, r2
 ; CHECK-NEXT:    mov lr, r7
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r4, r5, r7, pc}
Index: llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
===================================================================
--- llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
+++ llvm/test/Transforms/InterleavedAccess/AArch64/binopshuffles.ll
@@ -7,12 +7,15 @@
 define <4 x float> @vld2(<8 x float>* %pSrc) {
 ; CHECK-LABEL: @vld2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L3:%.*]] = shufflevector <8 x float> [[L2]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L26:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L43:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L43]], [[L26]]
 ; CHECK-NEXT:    ret <4 x float> [[L6]]
 ;
 entry:
@@ -28,15 +31,19 @@
 define <4 x float> @vld3(<12 x float>* %pSrc) {
 ; CHECK-LABEL: @vld3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x float>, <12 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L3:%.*]] = shufflevector <12 x float> [[L2]], <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[L4:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L5:%.*]] = shufflevector <12 x float> [[L4]], <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L5]], [[L3]]
-; CHECK-NEXT:    [[L7:%.*]] = fmul fast <12 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L8:%.*]] = shufflevector <12 x float> [[L7]], <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:    [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L8]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <12 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L29:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[L46:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L6:%.*]] = fadd fast <4 x float> [[L46]], [[L29]]
+; CHECK-NEXT:    [[L73:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L9:%.*]] = fadd fast <4 x float> [[L6]], [[L73]]
 ; CHECK-NEXT:    ret <4 x float> [[L9]]
 ;
 entry:
@@ -55,17 +62,22 @@
 define <4 x float> @vld4(<16 x float>* %pSrc) {
 ; CHECK-LABEL: @vld4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x float>, <16 x float>* [[PSRC:%.*]], align 4
-; CHECK-NEXT:    [[L3:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L4:%.*]] = shufflevector <16 x float> [[L3]], <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[L5:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L6:%.*]] = shufflevector <16 x float> [[L5]], <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:    [[L7:%.*]] = fadd fast <4 x float> [[L6]], [[L4]]
-; CHECK-NEXT:    [[L8:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L9:%.*]] = shufflevector <16 x float> [[L8]], <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:    [[L10:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L11:%.*]] = shufflevector <16 x float> [[L10]], <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    [[L12:%.*]] = fadd fast <4 x float> [[L11]], [[L9]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x float>* [[PSRC:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[L312:%.*]] = fmul <4 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[L59:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[L7:%.*]] = fadd fast <4 x float> [[L59]], [[L312]]
+; CHECK-NEXT:    [[L86:%.*]] = fmul <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[L103:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[L12:%.*]] = fadd fast <4 x float> [[L103]], [[L86]]
 ; CHECK-NEXT:    ret <4 x float> [[L12]]
 ;
 entry:
@@ -86,13 +98,17 @@
 define <4 x float> @twosrc(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
 ; CHECK-LABEL: @twosrc(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT:    [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT:    [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[L6:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L7:%.*]] = shufflevector <8 x float> [[L6]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L7]], [[L5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN7:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN7]], 1
+; CHECK-NEXT:    [[L46:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[L63:%.*]] = fmul <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L63]], [[L46]]
 ; CHECK-NEXT:    ret <4 x float> [[L8]]
 ;
 entry:
@@ -109,14 +125,17 @@
 define <4 x float> @twosrc2(<8 x float>* %pSrc1, <8 x float>* %pSrc2) {
 ; CHECK-LABEL: @twosrc2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[PSRC1:%.*]], align 4
-; CHECK-NEXT:    [[WIDE_VEC26:%.*]] = load <8 x float>, <8 x float>* [[PSRC2:%.*]], align 4
-; CHECK-NEXT:    [[L4:%.*]] = fmul fast <8 x float> [[WIDE_VEC26]], [[WIDE_VEC]]
-; CHECK-NEXT:    [[L5:%.*]] = shufflevector <8 x float> [[L4]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x float> [[WIDE_VEC26]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[S2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[L6:%.*]] = fmul fast <4 x float> [[S1]], [[S2]]
-; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x float>* [[PSRC1:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x float>* [[PSRC2:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[LDN4:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN4]], 1
+; CHECK-NEXT:    [[L43:%.*]] = fmul <4 x float> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[L6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[L8:%.*]] = fadd fast <4 x float> [[L6]], [[L43]]
 ; CHECK-NEXT:    ret <4 x float> [[L8]]
 ;
 entry:
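The RUN lines of binopshuffles.ll fall outside the changed hunks, so they are not visible in this diff. A typical invocation for an InterleavedAccess IR test of this kind, assumed here rather than taken from the patch, would look like the following line at the top of the file:

  ; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -S | FileCheck %s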