Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -157,6 +157,7 @@ void initializeInstructionCombiningPassPass(PassRegistry&); void initializeInstCountPass(PassRegistry&); void initializeInstNamerPass(PassRegistry&); +void initializeInterleavedAccessPass(PassRegistry&); void initializeInternalizeLegacyPassPass(PassRegistry&); void initializeIntervalPartitionPass(PassRegistry&); void initializeIRTranslatorPass(PassRegistry &); Index: lib/CodeGen/CodeGen.cpp =================================================================== --- lib/CodeGen/CodeGen.cpp +++ lib/CodeGen/CodeGen.cpp @@ -34,6 +34,7 @@ initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); initializeIfConverterPass(Registry); + initializeInterleavedAccessPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); initializeLiveStacksPass(Registry); Index: lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- lib/CodeGen/InterleavedAccessPass.cpp +++ lib/CodeGen/InterleavedAccessPass.cpp @@ -40,6 +40,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -58,10 +59,6 @@ static unsigned MaxFactor; // The maximum supported interleave factor. 
-namespace llvm { -static void initializeInterleavedAccessPass(PassRegistry &); -} - namespace { class InterleavedAccess : public FunctionPass { @@ -69,7 +66,7 @@ public: static char ID; InterleavedAccess(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) { initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); } @@ -77,7 +74,13 @@ bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + } + private: + DominatorTree *DT; const TargetMachine *TM; const TargetLowering *TLI; @@ -88,13 +91,26 @@ /// \brief Transform an interleaved store into target specific intrinsics. bool lowerInterleavedStore(StoreInst *SI, SmallVector &DeadInsts); + + /// \brief Returns true if the uses of an interleaved load by the + /// extractelement instructions in \p Extracts can be replaced by uses of the + /// shufflevector instructions in \p Shuffles instead. If so, the necessary + /// replacements are also performed. + bool tryReplaceExtracts(ArrayRef Extracts, + ArrayRef Shuffles); }; } // end anonymous namespace. char InterleavedAccess::ID = 0; -INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access", - "Lower interleaved memory accesses to target specific intrinsics", - false, false) +INITIALIZE_TM_PASS_BEGIN( + InterleavedAccess, "interleaved-access", + "Lower interleaved memory accesses to target specific intrinsics", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_TM_PASS_END( + InterleavedAccess, "interleaved-access", + "Lower interleaved memory accesses to target specific intrinsics", false, + false) FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) { return new InterleavedAccess(TM); @@ -183,9 +199,18 @@ return false; SmallVector Shuffles; + SmallVector Extracts; - // Check if all users of this load are shufflevectors. 
+ // Check if all users of this load are shufflevectors. If we encounter any + // users that are extractelement instructions, we save them to later check if + // they can be modified to extract from one of the shufflevectors instead of + // the load. for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) { + auto *Extract = dyn_cast(*UI); + if (Extract && isa(Extract->getIndexOperand())) { + Extracts.push_back(Extract); + continue; + } ShuffleVectorInst *SVI = dyn_cast(*UI); if (!SVI || !isa(SVI->getOperand(1))) return false; @@ -221,6 +246,11 @@ Indices.push_back(Index); } + // Try and modify users of the load that are extractelement instructions to + // use the shufflevector instructions instead of the load. + if (!tryReplaceExtracts(Extracts, Shuffles)) + return false; + DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); // Try to create target specific intrinsics to replace the load and shuffles. @@ -234,6 +264,73 @@ return true; } +bool InterleavedAccess::tryReplaceExtracts( + ArrayRef Extracts, + ArrayRef Shuffles) { + + // If there aren't any extractelement instructions to modify, there's nothing + // to do. + if (Extracts.empty()) + return true; + + // Maps extractelement instructions to vector-index pairs. The extractelement + // instructions will be modified to use the new vector and index operands. + DenseMap> ReplacementMap; + + for (auto *Extract : Extracts) { + + // The vector index that is extracted. + auto *IndexOperand = cast(Extract->getIndexOperand()); + auto Index = IndexOperand->getSExtValue(); + + // Look for a suitable shufflevector instruction. The goal is to modify the + // extractelement instruction (which uses an interleaved load) to use one + // of the shufflevector instructions instead of the load. + for (auto *Shuffle : Shuffles) { + + // If the shufflevector instruction doesn't dominate the extract, we + // can't create a use of it. 
+ if (!DT->dominates(Shuffle, Extract)) + continue; + + // Inspect the indices of the shufflevector instruction. If the shuffle + // selects the same index that is extracted, we can modify the + // extractelement instruction. + SmallVector Indices; + Shuffle->getShuffleMask(Indices); + for (unsigned I = 0; I < Indices.size(); ++I) + if (Indices[I] == Index) { + assert(Extract->getOperand(0) == Shuffle->getOperand(0) && + "Vector operations do not match"); + ReplacementMap[Extract] = std::make_pair(Shuffle, I); + break; + } + + // If we found a suitable shufflevector instruction, stop looking. + if (ReplacementMap.count(Extract)) + break; + } + + // If we did not find a suitable shufflevector instruction, the + // extractelement instruction cannot be modified, so we must give up. + if (!ReplacementMap.count(Extract)) + return false; + } + + // Finally, perform the replacements. + IRBuilder<> Builder(Extracts[0]->getContext()); + for (auto &Replacement : ReplacementMap) { + auto *Extract = Replacement.first; + auto *Vector = Replacement.second.first; + auto Index = Replacement.second.second; + Builder.SetInsertPoint(Extract); + Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index)); + Extract->eraseFromParent(); + } + + return true; +} + bool InterleavedAccess::lowerInterleavedStore( StoreInst *SI, SmallVector &DeadInsts) { if (!SI->isSimple()) @@ -266,6 +363,7 @@ DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + DT = &getAnalysis().getDomTree(); TLI = TM->getSubtargetImpl(F)->getTargetLowering(); MaxFactor = TLI->getMaxSupportedInterleaveFactor(); Index: test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -mtriple=aarch64 -interleaved-access -S | FileCheck %s + +; CHECK-LABEL: @extract_user_basic( +; CHECK: %ldN = call { <4 x 
i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* {{.*}}) +; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 1 +define void @extract_user_basic(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + br i1 %C, label %if.then, label %if.merge + +if.then: + %E = extractelement <8 x i32> %L, i32 2 + br label %if.merge + +if.merge: + ret void +} + +; CHECK-LABEL: @extract_user_multi( +; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32 +; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 1 +define void @extract_user_multi(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + br i1 %C, label %if.then, label %if.merge + +if.then: + %E1 = extractelement <8 x i32> %L, i32 0 + br label %if.merge + +if.merge: + %E2 = extractelement <8 x i32> %L, i32 2 + ret void +} + +; CHECK-LABEL: @extract_user_multi_no_dom( +; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32 +define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %E1 = extractelement <8 x i32> %L, i32 0 + br i1 %C, label %if.then, label %if.merge + +if.then: + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E2 = extractelement <8 x i32> %L, i32 2 + br label %if.merge + +if.merge: + ret void +} + +; CHECK-LABEL: @extract_user_wrong_const_index( +; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32 +define void @extract_user_wrong_const_index(<8 x i32>* %A) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = 
extractelement <8 x i32> %L, i32 1 + ret void +} + +; CHECK-LABEL: @extract_user_undef_index( +; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32 +define void @extract_user_undef_index(<8 x i32>* %A) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = extractelement <8 x i32> %L, i32 undef + ret void +} + +; CHECK-LABEL: @extract_user_var_index( +; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32 +define void @extract_user_var_index(<8 x i32>* %A, i32 %I) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = extractelement <8 x i32> %L, i32 %I + ret void +} Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll =================================================================== --- test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -268,3 +268,15 @@ store <3 x float> %tmp1, <3 x float>* %p, align 16 ret void } + +; NEON-LABEL: load_factor2_with_extract_user: +; NEON: ld2 { v0.4s, v1.4s }, [x0] +; NEON: mov w0, v0.s[1] +; NONEON-LABEL: load_factor2_with_extract_user: +; NONEON-NOT: ld2 +define i32 @load_factor2_with_extract_user(<8 x i32>* %a) { + %1 = load <8 x i32>, <8 x i32>* %a, align 8 + %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> + %3 = extractelement <8 x i32> %1, i32 2 + ret i32 %3 +} Index: test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -mtriple=arm-eabi -mattr=+neon -interleaved-access -S | FileCheck %s + +; CHECK-LABEL: @extract_user_basic( +; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8 +; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } 
%vldN, 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 1 +define void @extract_user_basic(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + br i1 %C, label %if.then, label %if.merge + +if.then: + %E = extractelement <8 x i32> %L, i32 2 + br label %if.merge + +if.merge: + ret void +} + +; CHECK-LABEL: @extract_user_multi( +; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8 +; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 0 +; CHECK: extractelement <4 x i32> %[[R]], i64 1 +define void @extract_user_multi(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + br i1 %C, label %if.then, label %if.merge + +if.then: + %E1 = extractelement <8 x i32> %L, i32 0 + br label %if.merge + +if.merge: + %E2 = extractelement <8 x i32> %L, i32 2 + ret void +} + +; CHECK-LABEL: @extract_user_multi_no_dom( +; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8 +define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %E1 = extractelement <8 x i32> %L, i32 0 + br i1 %C, label %if.then, label %if.merge + +if.then: + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E2 = extractelement <8 x i32> %L, i32 2 + br label %if.merge + +if.merge: + ret void +} + +; CHECK-LABEL: @extract_user_wrong_const_index( +; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8 +define void @extract_user_wrong_const_index(<8 x i32>* %A) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = extractelement <8 x i32> %L, i32 1 + ret void +} + +; CHECK-LABEL: @extract_user_undef_index( +; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } 
@llvm.arm.neon.vld2.v4i32.p0i8 +define void @extract_user_undef_index(<8 x i32>* %A) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = extractelement <8 x i32> %L, i32 undef + ret void +} + +; CHECK-LABEL: @extract_user_var_index( +; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8 +define void @extract_user_var_index(<8 x i32>* %A, i32 %I) { +entry: + %L = load <8 x i32>, <8 x i32>* %A, align 8 + %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> + %E = extractelement <8 x i32> %L, i32 %I + ret void +} Index: test/CodeGen/ARM/arm-interleaved-accesses.ll =================================================================== --- test/CodeGen/ARM/arm-interleaved-accesses.ll +++ test/CodeGen/ARM/arm-interleaved-accesses.ll @@ -304,3 +304,15 @@ store <3 x float> %tmp1, <3 x float>* %p, align 16 ret void } + +; NEON-LABEL: load_factor2_with_extract_user: +; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64] +; NEON: vmov.32 r0, d16[1] +; NONEON-LABEL: load_factor2_with_extract_user: +; NONEON-NOT: vld2 +define i32 @load_factor2_with_extract_user(<8 x i32>* %a) { + %1 = load <8 x i32>, <8 x i32>* %a, align 8 + %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> + %3 = extractelement <8 x i32> %1, i32 2 + ret i32 %3 +} Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -363,6 +363,7 @@ initializeSafeStackPass(Registry); initializeSjLjEHPreparePass(Registry); initializePreISelIntrinsicLoweringPass(Registry); + initializeInterleavedAccessPass(Registry); #ifdef LINK_POLLY_INTO_TOOLS polly::initializePollyPasses(Registry);