Index: lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- lib/CodeGen/InterleavedAccessPass.cpp +++ lib/CodeGen/InterleavedAccessPass.cpp @@ -40,6 +40,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -69,7 +70,7 @@ public: static char ID; InterleavedAccess(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) { initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); } @@ -77,7 +78,13 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + } + private: + DominatorTree *DT; const TargetMachine *TM; const TargetLowering *TLI; @@ -88,6 +95,13 @@ /// \brief Transform an interleaved store into target specific intrinsics. bool lowerInterleavedStore(StoreInst *SI, SmallVector &DeadInsts); + + /// \brief Returns true if the uses of an interleaved load by the + /// extractelement instructions in \p Extracts can be replaced by uses of the + /// shufflevector instructions in \p Shuffles instead. If so, the necessary + /// replacements are also performed. + bool canReplaceExtracts(ArrayRef Extracts, + ArrayRef Shuffles); }; } // end anonymous namespace. @@ -183,9 +197,18 @@ return false; SmallVector Shuffles; + SmallVector Extracts; - // Check if all users of this load are shufflevectors. + // Check if all users of this load are shufflevectors. If we encounter any + // users that are extractelement instructions, we save them to later check if + // they can be modified to extract from one of the shufflevectors instead of + // the load. 
for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) { + auto *Extract = dyn_cast(*UI); + if (Extract && isa(Extract->getIndexOperand())) { + Extracts.push_back(Extract); + continue; + } ShuffleVectorInst *SVI = dyn_cast(*UI); if (!SVI || !isa(SVI->getOperand(1))) return false; @@ -221,6 +244,11 @@ Indices.push_back(Index); } + // Try and modify users of the load that are extractelement instructions to + // use the shufflevector instructions instead of the load. + if (!canReplaceExtracts(Extracts, Shuffles)) + return false; + DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); // Try to create target specific intrinsics to replace the load and shuffles. @@ -234,6 +262,73 @@ return true; } +bool InterleavedAccess::canReplaceExtracts( + ArrayRef Extracts, + ArrayRef Shuffles) { + + // If there aren't any extractelement instructions to modify, there's nothing + // to do. + if (Extracts.empty()) + return true; + + // Maps extractelement instructions to vector-index pairs. The extractelement + // instructions will be modified to use the new vector and index operands. + DenseMap> ReplacementMap; + + for (auto *Extract : Extracts) { + + // The vector index that is extracted. + auto *IndexOperand = cast(Extract->getIndexOperand()); + auto Index = IndexOperand->getSExtValue(); + + // Look for a suitable shufflevector instruction. The goal is to modify the + // extractelement instruction (which uses an interleaved load) to use one + // of the shufflevector instructions instead of the load. + for (auto *Shuffle : Shuffles) { + + // If the shufflevector instruction doesn't dominate the extract, we + // can't create a use of it. + if (!DT->dominates(Shuffle, Extract)) + continue; + + // Inspect the indices of the shufflevector instruction. If the shuffle + // selects the same index that is extracted, we can modify the + // extractelement instruction. 
+ SmallVector Indices; + Shuffle->getShuffleMask(Indices); + for (unsigned I = 0; I < Indices.size(); ++I) + if (Indices[I] == Index) { + assert(Extract->getOperand(0) == Shuffle->getOperand(0) && + "Vector operations do not match"); + ReplacementMap[Extract] = std::make_pair(Shuffle, I); + break; + } + + // If we found a suitable shufflevector instruction, stop looking. + if (ReplacementMap.count(Extract)) + break; + } + + // If we did not find a suitable shufflevector instruction, the + // extractelement instruction cannot be modified, so we must give up. + if (!ReplacementMap.count(Extract)) + return false; + } + + // Finally, perform the replacements. + for (auto &Replacement : ReplacementMap) { + auto *Extract = Replacement.first; + auto *Vector = Replacement.second.first; + auto *Index = ConstantInt::get(Type::getInt32Ty(Vector->getContext()), + Replacement.second.second); + Extract->replaceAllUsesWith( + ExtractElementInst::Create(Vector, Index, "", Extract)); + Extract->eraseFromParent(); + } + + return true; +} + bool InterleavedAccess::lowerInterleavedStore( StoreInst *SI, SmallVector &DeadInsts) { if (!SI->isSimple()) @@ -266,6 +361,7 @@ DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + DT = &getAnalysis().getDomTree(); TLI = TM->getSubtargetImpl(F)->getTargetLowering(); MaxFactor = TLI->getMaxSupportedInterleaveFactor(); Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll =================================================================== --- test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -268,3 +268,15 @@ store <3 x float> %tmp1, <3 x float>* %p, align 16 ret void } + +; NEON-LABEL: load_factor2_with_extract_user: +; NEON: ld2 { v0.4s, v1.4s }, [x0] +; NEON: mov w0, v0.s[1] +; NONEON-LABEL: load_factor2_with_extract_user: +; NONEON-NOT: ld2 +define i32 @load_factor2_with_extract_user(<8 x i32>* %a) { + %1 = load <8 x i32>, <8 x i32>* %a, align 8 + %2 = 
shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> + %3 = extractelement <8 x i32> %1, i32 2 + ret i32 %3 +} Index: test/CodeGen/ARM/arm-interleaved-accesses.ll =================================================================== --- test/CodeGen/ARM/arm-interleaved-accesses.ll +++ test/CodeGen/ARM/arm-interleaved-accesses.ll @@ -304,3 +304,15 @@ store <3 x float> %tmp1, <3 x float>* %p, align 16 ret void } + +; NEON-LABEL: load_factor2_with_extract_user: +; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64] +; NEON: vmov.32 r0, d16[1] +; NONEON-LABEL: load_factor2_with_extract_user: +; NONEON-NOT: vld2 +define i32 @load_factor2_with_extract_user(<8 x i32>* %a) { + %1 = load <8 x i32>, <8 x i32>* %a, align 8 + %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> + %3 = extractelement <8 x i32> %1, i32 2 + ret i32 %3 +}