Index: include/llvm/Analysis/VectorUtils.h =================================================================== --- include/llvm/Analysis/VectorUtils.h +++ include/llvm/Analysis/VectorUtils.h @@ -272,7 +272,7 @@ /// Get the index for the given member. Unlike the key in the member /// map, the index starts from 0. - unsigned getIndex(InstTy *Instr) const { + unsigned getIndex(const InstTy *Instr) const { for (auto I : Members) { if (I.second == Instr) return I.first - SmallestKey; @@ -350,7 +350,8 @@ /// Get the interleave group that \p Instr belongs to. /// /// \returns nullptr if doesn't have such group. - InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { + InterleaveGroup * + getInterleaveGroup(const Instruction *Instr) const { if (InterleaveGroupMap.count(Instr)) return InterleaveGroupMap.find(Instr)->second; return nullptr; Index: lib/Transforms/Vectorize/CMakeLists.txt =================================================================== --- lib/Transforms/Vectorize/CMakeLists.txt +++ lib/Transforms/Vectorize/CMakeLists.txt @@ -7,6 +7,7 @@ VPlan.cpp VPlanHCFGBuilder.cpp VPlanHCFGTransforms.cpp + VPlanSLP.cpp VPlanVerifier.cpp ADDITIONAL_HEADER_DIRS Index: lib/Transforms/Vectorize/VPlan.h =================================================================== --- lib/Transforms/Vectorize/VPlan.h +++ lib/Transforms/Vectorize/VPlan.h @@ -60,6 +60,7 @@ class VPBasicBlock; class VPRegionBlock; class VPlan; +class VPlanSlp; /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: @@ -606,10 +607,15 @@ /// the VPInstruction is also a single def-use vertex. class VPInstruction : public VPUser, public VPRecipeBase { friend class VPlanHCFGTransforms; + friend class VPlanSlp; public: /// VPlan opcodes, extending LLVM IR with idiomatics instructions. - enum { Not = Instruction::OtherOpsEnd + 1 }; + enum { + Not = Instruction::OtherOpsEnd + 1, + SLPLoad, + SLPStore, + }; private: typedef unsigned char OpcodeTy; @@ -619,6 +625,13 @@ /// modeled instruction. void generateInstruction(VPTransformState &State, unsigned Part); +protected: + Instruction *getUnderlyingInstr() { + return cast_or_null(getUnderlyingValue()); + } + + void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } + public: VPInstruction(unsigned Opcode, ArrayRef Operands) : VPUser(VPValue::VPInstructionSC, Operands), @@ -632,6 +645,11 @@ return V->getVPValueID() == VPValue::VPInstructionSC; } + VPInstruction *clone() const { + SmallVector Operands(operands()); + return new VPInstruction(Opcode, Operands); + } + /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPRecipeBase *R) { return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC; @@ -649,6 +667,14 @@ /// Print the VPInstruction. void print(raw_ostream &O) const; + + /// Return true if this instruction may modify memory. + bool mayWriteToMemory() const { + // TODO: we can use attributes of the called function to rule out memory + // modifications. + return Opcode == Instruction::Store || Opcode == Instruction::Call || + Opcode == Instruction::Invoke || Opcode == SLPStore; + } }; /// VPWidenRecipe is a recipe for producing a copy of vector type for each @@ -1473,6 +1499,81 @@ } }; +/// Class that maps (parts of) an existing VPlan to trees of combined +/// VPInstructions. +class VPlanSlp { +private: + enum class OpMode { Failed, Load, Opcode }; + + /// Mapping of values in the original VPlan to a combined VPInstruction. + DenseMap, VPInstruction *> BundleToCombined; + + VPInterleavedAccessInfo &IAI; + + /// Basic block to operate on. For now, only instructions in a single BB are + /// considered. + const VPBasicBlock &BB; + + /// Indicates whether we managed to combine all visited instructions or not. + bool CompletelySLP = true; + + /// Width of the widest combined bundle in bits. + unsigned WidestBundleBits = 0; + + using MultiNodeOpTy = + typename std::pair>; + + // Input operand bundles for the current multi node. Each multi node operand + // bundle contains values not matching the multi node's opcode. They will + // be reordered in reorderMultiNodeOps, once we completed building a + // multi node. + SmallVector MultiNodeOps; + + /// Indicates whether we are building a multi node currently. + bool MultiNodeActive = false; + + /// Check if we can vectorize Operands together. + bool areVectorizable(ArrayRef Operands) const; + + /// Add combined instruction \p New for the bundle \p Operands. + void addCombined(ArrayRef Operands, VPInstruction *New); + + /// Indicate we hit a bundle we failed to combine. Returns a dummy + /// instruction. + VPInstruction *markFailed(); + + /// Reorder operands in the multi node to maximize sequential memory access + /// and commutative operations. + SmallVector reorderMultiNodeOps(); + + /// Choose the best candidate to use for the lane after \p Last. The set of + /// candidates to choose from are values with an opcode matching \p Last's + /// or loads consecutive to \p Last. + std::pair getBest(OpMode Mode, VPValue *Last, + SmallVectorImpl &Candidates, + VPInterleavedAccessInfo &IAI); + + /// Print bundle \p Values to dbgs(). + void dumpBundle(ArrayRef Values); + +public: + VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} + + ~VPlanSlp() { + for (auto &KV : BundleToCombined) + delete KV.second; + } + + /// Tries to build an SLP tree rooted at \p Operands and returns a + /// VPInstruction combining \p Operands, if they can be combined. + VPInstruction *buildGraph(ArrayRef Operands); + + /// Return the width of the widest combined bundle in bits. + unsigned getWidestBundleBits() const { return WidestBundleBits; } + + /// Return true if all visited instruction can be combined. + bool isCompletelySLP() const { return CompletelySLP; } +}; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H Index: lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- lib/Transforms/Vectorize/VPlan.cpp +++ lib/Transforms/Vectorize/VPlan.cpp @@ -328,6 +328,12 @@ case VPInstruction::Not: O << "not"; break; + case VPInstruction::SLPLoad: + O << "combined load"; + break; + case VPInstruction::SLPStore: + O << "combined store"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -657,6 +663,13 @@ template void DomTreeBuilder::Calculate(VPDominatorTree &DT); +void VPValue::replaceAllUsesWith(VPValue *New) { + for (VPUser *User : users()) + for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) + if (User->getOperand(I) == this) + User->setOperand(I, New); +} + VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI) { DenseMap *, InterleaveGroup *> @@ -669,7 +682,7 @@ for (auto I = VPBB->begin(), E = VPBB->end(); I != E; I++) { assert(isa(&*I) && "Can only handle VPInstructions"); VPInstruction *VPInst = cast(&*I); - Instruction *Inst = cast(VPInst->getUnderlyingValue()); + const auto *Inst = cast(VPInst->getUnderlyingValue()); auto *IG = IAI.getInterleaveGroup(Inst); if (!IG) continue; Index: lib/Transforms/Vectorize/VPlanSLP.cpp =================================================================== --- /dev/null +++ lib/Transforms/Vectorize/VPlanSLP.cpp @@ -0,0 +1,460 @@ +//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// This file implements SLP analysis based on VPlan. The analysis is based on +/// the ideas described in +/// +/// Look-ahead SLP: auto-vectorization in the presence of commutative +/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, +/// Luís F. W. Góes +/// +//===----------------------------------------------------------------------===// + +#include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "vplan-slp" + +// Number of levels to look ahead when re-ordering multi node operands. +static unsigned LookaheadMaxDepth = 5; + +VPInstruction *VPlanSlp::markFailed() { + // FIXME: Currently this is used to signal we hit instructions we cannot + // trivially SLP'ize. + CompletelySLP = false; + return new VPInstruction(0, {}); +} + +void VPlanSlp::addCombined(ArrayRef Operands, VPInstruction *New) { + if (all_of(Operands, [](VPValue *V) { + return cast(V)->getUnderlyingInstr(); + })) { + unsigned BundleSize = 0; + for (VPValue *V : Operands) { + Type *T = cast(V)->getUnderlyingInstr()->getType(); + assert(!T->isVectorTy() && "Only scalar types supported for now"); + BundleSize += T->getScalarSizeInBits(); + } + WidestBundleBits = std::max(WidestBundleBits, BundleSize); + } + + auto Res = BundleToCombined.insert({to_vector<4>(Operands), New}); + assert(Res.second && + "Already created a combined instruction for the operand bundle"); + (void)Res; +} + +bool VPlanSlp::areVectorizable(ArrayRef Operands) const { + // Currently we only support VPInstructions. + if (!all_of(Operands, [](VPValue *Op) { + return Op && isa(Op) && + cast(Op)->getUnderlyingInstr(); + })) { + LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); + return false; + } + + // Check if opcodes and type width agree for all instructions in the bundle. + // FIXME: Differing widths/opcodes can be handled by inserting additional + // instructions. + // FIXME: Deal with non-primitive types. + const Instruction *OriginalInstr = + cast(Operands[0])->getUnderlyingInstr(); + unsigned Opcode = OriginalInstr->getOpcode(); + unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); + if (!all_of(Operands, [Opcode, Width](VPValue *Op) { + const Instruction *I = cast(Op)->getUnderlyingInstr(); + return I->getOpcode() == Opcode && + I->getType()->getPrimitiveSizeInBits() == Width; + })) { + LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); + return false; + } + + // For now, all operands must be defined in the same BB. + if (any_of(Operands, [this, &Operands](VPValue *Op) { + return cast(Op)->getParent() != &this->BB; + })) { + LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); + return false; + } + + if (any_of(Operands, + [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { + LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); + return false; + } + + // For loads, check that there are no stores between them. + // FIXME: we only have to forbid stores that could interfere with any of the + // loads in the bundle + // FIXME: we do not have OrderedVPBBs, so for now we just have to scan a BB + // until we find the first load in the bundle and then check for + // stores until we find the last load. + if (Opcode == Instruction::Load) { + VPBasicBlock *Parent = cast(Operands[0])->getParent(); + if (any_of(Operands, [Parent](VPValue *V) { + return cast(V)->getParent() != Parent; + })) { + LLVM_DEBUG(dbgs() << "VPSLP: not all in same BB\n"); + return false; + } + unsigned LoadsSeen = 0; + for (auto &I : *Parent) { + auto *VPI = cast(&I); + if (VPI->getOpcode() == Instruction::Load && + std::find(Operands.begin(), Operands.end(), VPI) != Operands.end()) + LoadsSeen++; + + if (LoadsSeen == Operands.size()) + break; + + if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { + LLVM_DEBUG( + dbgs() << "VPSLP: instruction modifying memory between loads\n"); + return false; + } + } + } + return true; +} + +static SmallVector getOperands(ArrayRef Values, + unsigned OperandIndex) { + SmallVector Operands; + for (VPValue *V : Values) { + auto *U = cast(V); + Operands.push_back(U->getOperand(OperandIndex)); + } + return Operands; +} + +static bool areCommutative(ArrayRef Values) { + return Instruction::isCommutative( + cast(Values[0])->getOpcode()); +} + +static SmallVector, 4> +getOperands(ArrayRef Values) { + SmallVector, 4> Result; + auto *VPI = cast(Values[0]); + + switch (VPI->getOpcode()) { + case Instruction::Load: + llvm_unreachable("Loads terminate a tree, no need to get operands"); + case Instruction::Store: + Result.push_back(getOperands(Values, 0)); + break; + default: + for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) + Result.push_back(getOperands(Values, I)); + break; + } + + return Result; +} + +/// Returns the opcode of Values or ~0 if they do not all agree. +static Optional getOpcode(ArrayRef Values) { + unsigned Opcode = cast(Values[0])->getOpcode(); + if (any_of(Values, [Opcode](VPValue *V) { + return cast(V)->getOpcode() != Opcode; + })) + return None; + return {Opcode}; +} + +/// Returns true if A and B access sequential memory if they are loads or +/// stores or if they have identical opcodes otherwise. +static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, + VPInterleavedAccessInfo &IAI) { + if (A->getOpcode() != B->getOpcode()) + return false; + + if (A->getOpcode() != Instruction::Load && + A->getOpcode() != Instruction::Store) + return true; + auto *GA = IAI.getInterleaveGroup(A); + auto *GB = IAI.getInterleaveGroup(B); + + return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); +} + +/// Implements getLAScore from Listing 7 in the paper. +/// Traverses and compares operands of V1 and V2 to MaxLevel. +static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, + VPInterleavedAccessInfo &IAI) { + if (!isa(V1) || !isa(V2)) + return 0; + + if (MaxLevel == 0) + return (unsigned)areConsecutiveOrMatch(cast(V1), + cast(V2), IAI); + + unsigned Score = 0; + for (unsigned I = 0, EV1 = cast(V1)->getNumOperands(); I < EV1; ++I) + for (unsigned J = 0, EV2 = cast(V2)->getNumOperands(); J < EV2; ++J) + Score += getLAScore(cast(V1)->getOperand(I), + cast(V2)->getOperand(J), MaxLevel - 1, IAI); + return Score; +} + +std::pair +VPlanSlp::getBest(OpMode Mode, VPValue *Last, + SmallVectorImpl &Candidates, + VPInterleavedAccessInfo &IAI) { + LLVM_DEBUG(dbgs() << " getBest\n"); + VPValue *Best = Candidates[0]; + SmallVector BestCandidates; + + LLVM_DEBUG(dbgs() << " Candidates for " + << *cast(Last)->getUnderlyingInstr() << " "); + for (auto *Candidate : Candidates) { + auto *LastI = cast(Last); + auto *CandidateI = cast(Candidate); + if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { + LLVM_DEBUG(dbgs() << *cast(Candidate)->getUnderlyingInstr() + << " "); + BestCandidates.push_back(Candidate); + } + } + LLVM_DEBUG(dbgs() << "\n"); + + if (BestCandidates.empty()) + return {OpMode::Failed, nullptr}; + + if (BestCandidates.size() == 1) + return {Mode, BestCandidates[0]}; + + if (Mode == OpMode::Opcode) { + unsigned BestScore = 0; + for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { + unsigned PrevScore = ~0u; + bool AllSame = true; + + // FIXME: Avoid visiting the same operands multiple times. + for (auto *Candidate : BestCandidates) { + unsigned Score = getLAScore(Last, Candidate, Depth, IAI); + if (PrevScore == ~0u) + PrevScore = Score; + if (PrevScore != Score) + AllSame = false; + PrevScore = Score; + + if (Score > BestScore) { + BestScore = Score; + Best = Candidate; + } + } + if (!AllSame) + break; + } + } + LLVM_DEBUG(dbgs() << "Found best " + << *cast(Best)->getUnderlyingInstr() + << "\n"); + std::remove(Candidates.begin(), Candidates.end(), Best); + + return {Mode, Best}; +} + +SmallVector VPlanSlp::reorderMultiNodeOps() { + SmallVector FinalOrder; + SmallVector Mode; + FinalOrder.reserve(MultiNodeOps.size()); + Mode.reserve(MultiNodeOps.size()); + + LLVM_DEBUG(dbgs() << "Reordering multinode\n"); + + for (auto &Operands : MultiNodeOps) { + FinalOrder.push_back({Operands.first, {Operands.second[0]}}); + if (cast(Operands.second[0])->getOpcode() == + Instruction::Load) + Mode.push_back(OpMode::Load); + else + Mode.push_back(OpMode::Opcode); + } + + for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { + LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n"); + SmallVector Candidates; + Candidates.reserve(MultiNodeOps.size()); + LLVM_DEBUG(dbgs() << " Candidates "); + for (auto Ops : MultiNodeOps) { + LLVM_DEBUG( + dbgs() << *cast(Ops.second[Lane])->getUnderlyingInstr() + << " "); + Candidates.push_back(Ops.second[Lane]); + } + LLVM_DEBUG(dbgs() << "\n"); + + for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { + LLVM_DEBUG(dbgs() << " Checking " << Op << "\n"); + if (Mode[Op] == OpMode::Failed) + continue; + + VPValue *Last = FinalOrder[Op].second[Lane - 1]; + std::pair Res = + getBest(Mode[Op], Last, Candidates, IAI); + if (Res.second) + FinalOrder[Op].second.push_back(Res.second); + else + // TODO: handle this case + FinalOrder[Op].second.push_back(markFailed()); + } + } + + return FinalOrder; +} + +void VPlanSlp::dumpBundle(ArrayRef Values) { + LLVM_DEBUG(dbgs() << " Ops: "); + for (auto Op : Values) + if (auto *Instr = cast_or_null(Op)->getUnderlyingInstr()) + LLVM_DEBUG(dbgs() << *Instr << " | "); + else + LLVM_DEBUG(dbgs() << " nullptr | "); + LLVM_DEBUG(dbgs() << "\n"); +} + +VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { + assert(!Values.empty() && "Need some operands!"); + + // If we already visited this instruction bundle, re-use the existing node + auto I = BundleToCombined.find(to_vector<4>(Values)); + if (I != BundleToCombined.end()) { +#ifdef NDEBUG + // Check that the resulting graph is a tree. If we re-use a node, this means + // its values have multiple users. We only allow this, if all users of each + // value are the same instruction. + for (auto *V : Values) { + auto UI = V->user_begin(); + auto *FirstUser = *UI++; + while (UI != V->use_end()) { + assert(*UI == FirstUser && "Currently we only support SLP trees."); + UI++; + } + } +#endif + return I->second; + } + + // Dump inputs + LLVM_DEBUG({ + dbgs() << "buildGraph: "; + dumpBundle(Values); + }); + + if (!areVectorizable(Values)) + return markFailed(); + + assert(getOpcode(Values) && "Opcodes for all values must match"); + unsigned ValuesOpcode = getOpcode(Values).getValue(); + + SmallVector CombinedOperands; + if (areCommutative(Values)) { + bool MultiNodeRoot = !MultiNodeActive; + MultiNodeActive = true; + for (auto &Operands : getOperands(Values)) { + LLVM_DEBUG({ + dbgs() << " Visiting Commutative"; + dumpBundle(Operands); + }); + + auto OperandsOpcode = getOpcode(Operands); + if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { + LLVM_DEBUG(dbgs() << " Same opcode, continue building\n"); + CombinedOperands.push_back(buildGraph(Operands)); + } else { + LLVM_DEBUG(dbgs() << " Adding multinode Ops\n"); + // Create dummy VPInstruction, which will we replace later by the + // re-ordered operand. + VPInstruction *Op = new VPInstruction(0, {}); + CombinedOperands.push_back(Op); + MultiNodeOps.emplace_back(Op, Operands); + } + } + + if (MultiNodeRoot) { + LLVM_DEBUG(dbgs() << "Reorder \n"); + MultiNodeActive = false; + + auto FinalOrder = reorderMultiNodeOps(); + + MultiNodeOps.clear(); + for (auto &Ops : FinalOrder) { + VPInstruction *NewOp = buildGraph(Ops.second); + Ops.first->replaceAllUsesWith(NewOp); + for (unsigned i = 0; i < CombinedOperands.size(); i++) + if (CombinedOperands[i] == Ops.first) + CombinedOperands[i] = NewOp; + delete Ops.first; + Ops.first = NewOp; + } + LLVM_DEBUG(dbgs() << "Found final order\n"); + } + } else { + LLVM_DEBUG(dbgs() << " NonCommuntative\n"); + if (ValuesOpcode == Instruction::Load) + for (VPValue *V : Values) + CombinedOperands.push_back(cast(V)->getOperand(0)); + else + for (auto &Operands : getOperands(Values)) + CombinedOperands.push_back(buildGraph(Operands)); + } + + unsigned Opcode; + switch (ValuesOpcode) { + case Instruction::Load: + Opcode = VPInstruction::SLPLoad; + break; + case Instruction::Store: + Opcode = VPInstruction::SLPStore; + break; + default: + Opcode = ValuesOpcode; + break; + } + + if (!CompletelySLP) + return markFailed(); + + assert(CombinedOperands.size() > 0 && "Need more some operands"); + auto *VPI = new VPInstruction(Opcode, CombinedOperands); + VPI->setUnderlyingInstr(cast(Values[0])->getUnderlyingInstr()); + + LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); + cast(Values[0])->print(dbgs()); dbgs() << "\n"); + addCombined(Values, VPI); + return VPI; +} Index: lib/Transforms/Vectorize/VPlanValue.h =================================================================== --- lib/Transforms/Vectorize/VPlanValue.h +++ lib/Transforms/Vectorize/VPlanValue.h @@ -106,6 +106,20 @@ const_user_range users() const { return const_user_range(user_begin(), user_end()); } + + /// Returns true if the value has more than one unique user. + bool hasMoreThanOneUniqueUser() { + if (getNumUsers() == 0) + return false; + + // Check if all users match the first user. + auto Current = std::next(user_begin()); + while (Current != user_end() && *user_begin() == *Current) + Current++; + return Current != user_end(); + } + + void replaceAllUsesWith(VPValue *New); }; typedef DenseMap Value2VPValueTy; @@ -151,6 +165,8 @@ return Operands[N]; } + void setOperand(unsigned i, VPValue *New) { Operands[i] = New; } + typedef SmallVectorImpl::iterator operand_iterator; typedef SmallVectorImpl::const_iterator const_operand_iterator; typedef iterator_range operand_range; Index: unittests/Transforms/Vectorize/CMakeLists.txt =================================================================== --- unittests/Transforms/Vectorize/CMakeLists.txt +++ unittests/Transforms/Vectorize/CMakeLists.txt @@ -10,4 +10,5 @@ VPlanLoopInfoTest.cpp VPlanTest.cpp VPlanHCFGTest.cpp + VPlanSlpTest.cpp ) Index: unittests/Transforms/Vectorize/VPlanSlpTest.cpp =================================================================== --- /dev/null +++ unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -0,0 +1,784 @@ +//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "../lib/Transforms/Vectorize/VPlan.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h" +#include "VPlanTestBase.h" +#include "llvm/Analysis/VectorUtils.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace { + +class VPlanSlpTest : public VPlanTestBase { +protected: + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI; + DataLayout DL; + + std::unique_ptr AC; + std::unique_ptr SE; + std::unique_ptr AARes; + std::unique_ptr BasicAA; + std::unique_ptr LAI; + std::unique_ptr PSE; + std::unique_ptr IAI; + + VPlanSlpTest() + : TLII(), TLI(TLII), + DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" + "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" + "16:32:64-S128") {} + + VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L, + VPlan &Plan) { + AC.reset(new AssumptionCache(F)); + SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI)); + BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI)); + AARes.reset(new AAResults(TLI)); + AARes->addAAResult(*BasicAA); + PSE.reset(new PredicatedScalarEvolution(*SE, *L)); + LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI)); + IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); + IAI->analyzeInterleaving(); + return {Plan, *IAI}; + } +}; + +TEST_F(VPlanSlpTest, testSlpSimple_2) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 12)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 14)); + + VPlanSlp Slp(VPIAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + EXPECT_EQ(64u, Slp.getWidestBundleBits()); + EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode()); + + auto *CombinedAdd = cast(CombinedStore->getOperand(0)); + EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode()); + + auto *CombinedLoadA = cast(CombinedAdd->getOperand(0)); + auto *CombinedLoadB = cast(CombinedAdd->getOperand(1)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode()); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode()); +} + +TEST_F(VPlanSlpTest, testSlpSimple_3) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 " + " %indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + " %indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + " %indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + " %indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 12)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 14)); + + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPlanSlp Slp(VPIAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + EXPECT_EQ(64u, Slp.getWidestBundleBits()); + EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode()); + + auto *CombinedAdd = cast(CombinedStore->getOperand(0)); + EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode()); + + auto *CombinedLoadA = cast(CombinedAdd->getOperand(0)); + auto *CombinedLoadB = cast(CombinedAdd->getOperand(1)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode()); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode()); + + VPInstruction *GetA = cast(&*std::next(Body->begin(), 1)); + VPInstruction *GetB = cast(&*std::next(Body->begin(), 3)); + EXPECT_EQ(GetA, CombinedLoadA->getOperand(0)); + EXPECT_EQ(GetB, CombinedLoadB->getOperand(0)); +} + +TEST_F(VPlanSlpTest, testSlpReuse_1) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %add0 = add nsw i32 %vA0, %vA0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %add1 = add nsw i32 %vA1, %vA1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 8)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 10)); + + VPlanSlp Slp(VPIAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + EXPECT_EQ(64u, Slp.getWidestBundleBits()); + EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode()); + + auto *CombinedAdd = cast(CombinedStore->getOperand(0)); + EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode()); + + auto *CombinedLoadA = cast(CombinedAdd->getOperand(0)); + EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode()); +} + +TEST_F(VPlanSlpTest, testSlpReuse_2) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %add0 = add nsw i32 %vA0, %vA0\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %add1 = add nsw i32 %vA1, %vA1\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret i32 %vA1\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 5)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 10)); + + VPlanSlp Slp(VPIAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + Slp.buildGraph(StoreRoot); + EXPECT_FALSE(Slp.isCompletelySLP()); +} + +static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2, + VPBasicBlock *Body, + VPInterleavedAccessInfo &&IAI) { + VPlanSlp Slp(IAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + + EXPECT_TRUE(Slp.isCompletelySLP()); + EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::SLPStore); + + VPInstruction *CombinedAdd = + cast(CombinedStore->getOperand(0)); + EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add); + + VPInstruction *CombinedMulAB = + cast(CombinedAdd->getOperand(0)); + VPInstruction *CombinedMulCD = + cast(CombinedAdd->getOperand(1)); + EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul); + + VPInstruction *CombinedLoadA = + cast(CombinedMulAB->getOperand(0)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode()); + VPInstruction *LoadvA0 = cast(&*std::next(Body->begin(), 2)); + VPInstruction *LoadvA1 = cast(&*std::next(Body->begin(), 12)); + EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0)); + EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1)); + + VPInstruction *CombinedLoadB = + cast(CombinedMulAB->getOperand(1)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode()); + VPInstruction *LoadvB0 = cast(&*std::next(Body->begin(), 4)); + VPInstruction *LoadvB1 = cast(&*std::next(Body->begin(), 14)); + EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0)); + EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1)); + + EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul); + + VPInstruction *CombinedLoadC = + cast(CombinedMulCD->getOperand(0)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadC->getOpcode()); + VPInstruction *LoadvC0 = cast(&*std::next(Body->begin(), 7)); + VPInstruction *LoadvC1 = cast(&*std::next(Body->begin(), 17)); + EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0)); + EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1)); + + VPInstruction *CombinedLoadD = + cast(CombinedMulCD->getOperand(1)); + EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadD->getOpcode()); + VPInstruction *LoadvD0 = cast(&*std::next(Body->begin(), 9)); + VPInstruction *LoadvD1 = cast(&*std::next(Body->begin(), 19)); + EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0)); + EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1)); +} + +TEST_F(VPlanSlpTest, testSlpReorder_1) { + LLVMContext Ctx; + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " + "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %mul11 = mul nsw i32 %vA0, %vB0\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " %vC0 = load i32, i32* %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 0\n" + " %vD0 = load i32, i32* %D0, align 4\n" + " %mul12 = mul nsw i32 %vC0, %vD0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %mul21 = mul nsw i32 %vA1, %vB1\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " %vC1 = load i32, i32* %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 1\n" + " %vD1 = load i32, i32* %D1, align 4\n" + " %mul22 = mul nsw i32 %vC1, %vD1\n" + " %add1 = add nsw i32 %mul11, %mul12\n" + " %add2 = add nsw i32 %mul22, %mul21\n" + " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add1, i32* %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add2, i32* %E1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x3"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 24)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 26)); + + checkReorderExample( + Store1, Store2, Body, + getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan)); +} + +TEST_F(VPlanSlpTest, testSlpReorder_2) { + LLVMContext Ctx; + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " + "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %mul11 = mul nsw i32 %vA0, %vB0\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " %vC0 = load i32, i32* %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 0\n" + " %vD0 = load i32, i32* %D0, align 4\n" + " %mul12 = mul nsw i32 %vC0, %vD0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %mul21 = mul nsw i32 %vB1, %vA1\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " %vC1 = load i32, i32* %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 1\n" + " %vD1 = load i32, i32* %D1, align 4\n" + " %mul22 = mul nsw i32 %vD1, %vC1\n" + " %add1 = add nsw i32 %mul11, %mul12\n" + " %add2 = add nsw i32 %mul22, %mul21\n" + " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add1, i32* %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add2, i32* %E1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x3"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 24)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 26)); + + checkReorderExample( + Store1, Store2, Body, + getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan)); +} + +TEST_F(VPlanSlpTest, testSlpReorder_3) { + LLVMContext Ctx; + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " + "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %mul11 = mul nsw i32 %vA1, %vB0\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " %vC0 = load i32, i32* %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 0\n" + " %vD0 = load i32, i32* %D0, align 4\n" + " %mul12 = mul nsw i32 %vC0, %vD0\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %mul21 = mul nsw i32 %vB1, %vA0\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " %vC1 = load i32, i32* %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 1\n" + " %vD1 = load i32, i32* %D1, align 4\n" + " %mul22 = mul nsw i32 %vD1, %vC1\n" + " %add1 = add nsw i32 %mul11, %mul12\n" + " %add2 = add nsw i32 %mul22, %mul21\n" + " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add1, i32* %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add2, i32* %E1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x3"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 24)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 26)); + + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + VPlanSlp Slp(VPIAI, *Body); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + + // FIXME Need to select better first value for lane0. + EXPECT_FALSE(Slp.isCompletelySLP()); +} + +TEST_F(VPlanSlpTest, testSlpReorder_4) { + LLVMContext Ctx; + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* " + "%C, %struct.Test* %D, %struct.Test* %E) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %mul11 = mul nsw i32 %vA0, %vB0\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " %vC0 = load i32, i32* %C0, align 4\n" + " %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 0\n" + " %vD0 = load i32, i32* %D0, align 4\n" + " %mul12 = mul nsw i32 %vC0, %vD0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %mul21 = mul nsw i32 %vA1, %vB1\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " %vC1 = load i32, i32* %C1, align 4\n" + " %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 " + "%indvars.iv, i32 1\n" + " %vD1 = load i32, i32* %D1, align 4\n" + " %mul22 = mul nsw i32 %vC1, %vD1\n" + " %add1 = add nsw i32 %mul11, %mul12\n" + " %add2 = add nsw i32 %mul22, %mul21\n" + " %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add1, i32* %E0, align 4\n" + " %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add2, i32* %E1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x3"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(Body->begin(), 24)); + VPInstruction *Store2 = cast(&*std::next(Body->begin(), 26)); + + checkReorderExample( + Store1, Store2, Body, + getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan)); +} + +// Make sure we do not combine instructions with operands in different BBs. +TEST_F(VPlanSlpTest, testInstrsInDifferentBBs) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " br label %bb2\n" + "bb2:\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(BB2->begin(), 3)); + VPInstruction *Store2 = cast(&*std::next(BB2->begin(), 5)); + + VPlanSlp Slp(VPIAI, *BB2); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + EXPECT_EQ(0u, Slp.getWidestBundleBits()); + EXPECT_EQ(0, CombinedStore->getOpcode()); +} + +// Make sure we do not combine instructions with operands in different BBs. +TEST_F(VPlanSlpTest, testInstrsInDifferentBBs2) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " br label %bb2\n" + "bb2:\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock(); + VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock(); + + VPInstruction *Store1 = cast(&*std::next(BB2->begin(), 1)); + VPInstruction *Store2 = cast(&*std::next(BB2->begin(), 3)); + + VPlanSlp Slp(VPIAI, *BB2); + SmallVector StoreRoot = {Store1, Store2}; + VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot); + EXPECT_EQ(0u, Slp.getWidestBundleBits()); + EXPECT_EQ(0, CombinedStore->getOpcode()); +} + + +} // namespace +} // namespace llvm