Index: include/llvm/Analysis/VectorUtils.h
===================================================================
--- include/llvm/Analysis/VectorUtils.h
+++ include/llvm/Analysis/VectorUtils.h
@@ -271,7 +271,7 @@
   /// Get the index for the given member. Unlike the key in the member
   /// map, the index starts from 0.
-  unsigned getIndex(InstTy *Instr) const {
+  unsigned getIndex(const InstTy *Instr) const {
     for (auto I : Members) {
       if (I.second == Instr)
         return I.first - SmallestKey;
@@ -349,7 +349,8 @@
   /// Get the interleave group that \p Instr belongs to.
   ///
   /// \returns nullptr if \p Instr does not belong to any group.
-  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+  InterleaveGroup *
+  getInterleaveGroup(const Instruction *Instr) const {
     if (InterleaveGroupMap.count(Instr))
       return InterleaveGroupMap.find(Instr)->second;
     return nullptr;
Index: lib/Transforms/Vectorize/CMakeLists.txt
===================================================================
--- lib/Transforms/Vectorize/CMakeLists.txt
+++ lib/Transforms/Vectorize/CMakeLists.txt
@@ -7,6 +7,7 @@
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanHCFGTransforms.cpp
+  VPlanSLP.cpp
   VPlanVerifier.cpp
 
   ADDITIONAL_HEADER_DIRS
Index: lib/Transforms/Vectorize/VPlan.h
===================================================================
--- lib/Transforms/Vectorize/VPlan.h
+++ lib/Transforms/Vectorize/VPlan.h
@@ -586,7 +586,11 @@
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum {
+    Not = Instruction::OtherOpsEnd + 1,
+    CombinedLoad,
+    CombinedStore,
+  };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -609,12 +613,18 @@
     return V->getVPValueID() == VPValue::VPInstructionSC;
   }
 
+  /// Create a copy of this VPInstruction with the same opcode and operands.
+  VPInstruction *clone() const {
+    SmallVector<VPValue *, 2> Operands(operands());
+    return new VPInstruction(Opcode, Operands);
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
   }
 
   unsigned getOpcode() const { return Opcode; }
+  void setOpcode(unsigned Op) { Opcode = Op; }
 
   /// Generate the instruction.
   /// TODO: We currently execute only per-part unless a specific instance is
@@ -626,6 +636,12 @@
 
   /// Print the VPInstruction.
   void print(raw_ostream &O) const;
+
+  Instruction *getUnderlyingInstr() {
+    return cast_or_null<Instruction>(getUnderlyingValue());
+  }
+
+  void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
 };
 
 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
@@ -1363,6 +1379,76 @@
   }
 };
 
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPSlp {
+private:
+  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+  /// sorted SmallVectors of VPValue *.
+  struct BundleDenseMapInfo {
+    static SmallVector<VPValue *, 4> getEmptyKey() {
+      return {reinterpret_cast<VPValue *>(-1)};
+    }
+
+    static SmallVector<VPValue *, 4> getTombstoneKey() {
+      return {reinterpret_cast<VPValue *>(-2)};
+    }
+
+    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+                        const SmallVector<VPValue *, 4> &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Mapping of values in the original VPlan to a combined VPInstruction.
+  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+      BundleToCombined;
+
+  VPInterleavedAccessInfo &IAI;
+
+  /// Indicates whether we managed to combine all visited instructions or not.
+  bool CompletelySLP = true;
+
+  /// Width of the widest combined bundle in bits.
+  unsigned WidestBundleBits = 0;
+
+  using MultiNodeOpTy =
+      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+  // Input operand bundles for the current multi node. Each multi node operand
+  // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we have finished building the
+  // multi node.
+  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+  /// Indicates whether we are currently building a multi node.
+  bool MultiNodeActive = false;
+
+  /// Check if the bundle \p Operands can be vectorized together.
+  bool areVectorizable(SmallVectorImpl<VPValue *> &Operands) const;
+
+  /// Record \p New as the combined instruction for the bundle \p Operands and
+  /// update the widest bundle width.
+  VPInstruction *addCombined(SmallVector<VPValue *, 4> &Operands,
+                             VPInstruction *New);
+
+  /// Mark the analysis as failed (not completely SLP) and return a dummy
+  /// VPInstruction.
+  VPInstruction *markFailed();
+
+  /// Reorder the operand bundles of the current multi node, trying to find
+  /// consecutive memory accesses or matching opcodes per operand position.
+  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+public:
+  VPSlp(VPInterleavedAccessInfo &IAI) : IAI(IAI) {}
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(SmallVector<VPValue *, 4> &Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions could be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
Index: lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- lib/Transforms/Vectorize/VPlan.cpp
+++ lib/Transforms/Vectorize/VPlan.cpp
@@ -273,6 +273,12 @@
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::CombinedLoad:
+    O << "combined load";
+    break;
+  case VPInstruction::CombinedStore:
+    O << "combined store";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -577,6 +583,13 @@
   O << "\\l\"";
 }
 
+void VPValue::replaceAllUsesWith(VPValue *New) {
+  for (VPUser *User : users())
+    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+      if (User->getOperand(I) == this)
+        User->setOperand(I, New);
+}
+
 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
                                                  InterleavedAccessInfo &IAI) {
   DenseMap<InterleaveGroup *, InterleaveGroup *>
@@ -589,7 +602,7 @@
     for (auto I = VPBB->begin(), E = VPBB->end(); I != E; I++) {
       assert(isa<VPInstruction>(&*I) && "Can only handle VPInstructions");
       VPInstruction *VPInst = cast<VPInstruction>(&*I);
-      Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+      const Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
       auto *IG = IAI.getInterleaveGroup(Inst);
       if (!IG)
         continue;
Index: lib/Transforms/Vectorize/VPlanSLP.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,435 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Helper to unwrap a VPValue to the IR instruction it was created for.
+static const Instruction *getUnderlyingInstr(VPValue *V) {
+  return cast<VPInstruction>(V)->getUnderlyingInstr();
+}
+
+VPInstruction *VPSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return new VPInstruction(0, {});
+}
+
+static unsigned getNumUniqueUsers(VPValue *V) {
+  SmallPtrSet<VPUser *, 4> Users(V->user_begin(), V->user_end());
+  return Users.size();
+}
+
+VPInstruction *VPSlp::addCombined(SmallVector<VPValue *, 4> &Operands,
+                                  VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  assert(BundleToCombined.find(Operands) == BundleToCombined.end() &&
+         "Already created a combined instruction for the operand bundle");
+  BundleToCombined[Operands] = New;
+  return New;
+}
+
+bool VPSlp::areVectorizable(SmallVectorImpl<VPValue *> &Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr = getUnderlyingInstr(Operands[0]);
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = getUnderlyingInstr(Op);
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
+    return false;
+  }
+
+  if (!all_of(Operands,
+              [](VPValue *Op) { return getNumUniqueUsers(Op) <= 1; })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no stores between them.
+  // FIXME: We only have to forbid stores that could interfere with any of
+  //        the loads in the bundle.
+  // FIXME: We do not have ordered VPBasicBlocks, so for now we just scan the
+  //        block until we find the first load in the bundle and then check
+  //        for stores until we find the last load.
+  if (Opcode == Instruction::Load) {
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    if (any_of(Operands, [Parent](VPValue *V) {
+          return cast<VPInstruction>(V)->getParent() != Parent;
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: not all in same BB\n");
+      return false;
+    }
+    unsigned LoadsSeen = 0;
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+
+      if (LoadsSeen > 0 && VPI->getOpcode() == Instruction::Store) {
+        LLVM_DEBUG(dbgs() << "VPSLP: Store between loads\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+static SmallVector<VPValue *, 4>
+getOperands(SmallVectorImpl<VPValue *> &Values, unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(SmallVectorImpl<VPValue *> &Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(SmallVectorImpl<VPValue *> &Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    // Only the stored values are bundled; the address is added separately by
+    // the caller.
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default: {
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+  }
+
+  return Result;
+}
+
+/// Returns the common opcode of \p Values, or an empty Optional if the
+/// opcodes do not all agree.
+static Optional<unsigned> getOpcode(SmallVectorImpl<VPValue *> &Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return {};
+  return {Opcode};
+}
+
+enum class OpMode { Failed, Load, Opcode };
+
+/// Returns true if \p A and \p B are loads or stores accessing consecutive
+/// memory, or, for any other opcode, if their opcodes match.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  // The opcodes match at this point, so only check A.
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+static std::pair<OpMode, VPValue *>
+getBest(OpMode Mode, VPValue *Last, SmallVectorImpl<VPValue *> &Candidates,
+        VPInterleavedAccessInfo &IAI) {
+  LLVM_DEBUG(dbgs() << "      getBest\n");
+  VPValue *Best = Candidates[0];
+  SmallVector<VPValue *, 4> BestCandidates;
+
+  LLVM_DEBUG(dbgs() << "        Candidates for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  // Break ties using the look-ahead score, increasing the depth until the
+  // scores of the remaining candidates differ.
+  if (Mode == OpMode::Opcode) {
+    unsigned BestScore = 0;
+    for (unsigned Depth = 1; Depth < 5; Depth++) {
+      unsigned PrevScore = ~0u;
+      bool AllSame = true;
+
+      // FIXME: Avoid visiting the same operands multiple times.
+      for (auto *Candidate : BestCandidates) {
+        unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+        if (PrevScore == ~0u)
+          PrevScore = Score;
+        if (PrevScore != Score)
+          AllSame = false;
+        PrevScore = Score;
+
+        if (Score > BestScore) {
+          BestScore = Score;
+          Best = Candidate;
+        }
+      }
+      if (!AllSame)
+        break;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(std::remove(Candidates.begin(), Candidates.end(), Best),
+                   Candidates.end());
+
+  return {Mode, Best};
+}
+
+SmallVector<VPSlp::MultiNodeOpTy, 4> VPSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+
+  SmallVector<OpMode, 4> Mode;
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
+    SmallVector<VPValue *, 4> Candidates;
+    LLVM_DEBUG(dbgs() << "  Candidates ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.push_back(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+static void dumpBundle(ArrayRef<VPValue *> Values) {
+  LLVM_DEBUG(dbgs() << " Ops: ");
+  for (auto Op : Values) {
+    if (Op && getUnderlyingInstr(Op))
+      LLVM_DEBUG(dbgs() << *getUnderlyingInstr(Op) << " | ");
+    else
+      LLVM_DEBUG(dbgs() << " nullptr | ");
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+}
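+
+/// Tries to combine the bundle \p Values into a single VPInstruction tree.
+/// Bundles whose instructions all share an opcode are combined recursively;
+/// commutative operations open a multi node whose operand bundles are
+/// reordered via reorderMultiNodeOps before recursing. Loads and stores
+/// terminate the recursion and become CombinedLoad/CombinedStore nodes.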
+VPInstruction *VPSlp::buildGraph(SmallVector<VPValue *, 4> &Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node.
+  auto I = BundleToCombined.find(Values);
+  if (I != BundleToCombined.end())
+    return I->second;
+
+  // Dump inputs.
+  LLVM_DEBUG(dbgs() << "buildGraph ");
+  dumpBundle(Values);
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG(dbgs() << "  Visiting Commutative");
+      dumpBundle(Operands);
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
+        // Create a dummy VPInstruction, which we will replace later with the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder\n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned I = 0; I < CombinedOperands.size(); I++)
+          if (CombinedOperands[I] == Ops.first)
+            CombinedOperands[I] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << "  NonCommutative\n");
+    if (ValuesOpcode == Instruction::Load)
+      // Loads terminate the recursion; the combined load's operands are the
+      // addresses of the loads in the bundle.
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+    if (ValuesOpcode == Instruction::Store)
+      // The combined stored values are operand 0; the address of the first
+      // store in the bundle becomes operand 1.
+      CombinedOperands.push_back(cast<VPInstruction>(Values[0])->getOperand(1));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::CombinedLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::CombinedStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  assert(!CombinedOperands.empty() && "Need some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  return addCombined(Values, VPI);
+}
Index: lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- lib/Transforms/Vectorize/VPlanValue.h
+++ lib/Transforms/Vectorize/VPlanValue.h
@@ -104,6 +104,8 @@
   const_user_range users() const {
     return const_user_range(user_begin(), user_end());
   }
+
+  void replaceAllUsesWith(VPValue *New);
 };
 
 typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -149,6 +151,8 @@
     return Operands[N];
   }
 
+  void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
Index: unittests/Transforms/Vectorize/CMakeLists.txt
===================================================================
--- unittests/Transforms/Vectorize/CMakeLists.txt
+++ unittests/Transforms/Vectorize/CMakeLists.txt
@@ -8,4 +8,5 @@
 add_llvm_unittest(VectorizeTests
   VPlanTest.cpp
   VPlanHCFGTest.cpp
+  VPlanSlpTest.cpp
   )
Index: unittests/Transforms/Vectorize/VPlanSlpTest.cpp
===================================================================
--- /dev/null
+++ unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -0,0 +1,655 @@
+//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../lib/Transforms/Vectorize/VPlan.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h"
+#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h"
+#include "VPlanTestBase.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+class VPlanSlpTest : public VPlanTestBase {
+protected:
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI;
+  DataLayout DL;
+
+  std::unique_ptr<AssumptionCache> AC;
+  std::unique_ptr<ScalarEvolution> SE;
+  std::unique_ptr<AAResults> AARes;
+  std::unique_ptr<BasicAAResult> BasicAA;
+  std::unique_ptr<LoopAccessInfo> LAI;
+  std::unique_ptr<PredicatedScalarEvolution> PSE;
+  std::unique_ptr<InterleavedAccessInfo> IAI;
+
+  VPlanSlpTest()
+      : TLII(), TLI(TLII),
+        DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
+           "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:"
+           "16:32:64-S128") {}
+
+  VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L,
+                                                   VPlan &Plan) {
+    AC.reset(new AssumptionCache(F));
+    SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI));
+    BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI));
+    AARes.reset(new AAResults(TLI));
+    AARes->addAAResult(*BasicAA);
+    PSE.reset(new PredicatedScalarEvolution(*SE, *L));
+    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
+    IAI->analyzeInterleaving();
+    return {Plan, *IAI};
+  }
+};
+
+TEST_F(VPlanSlpTest, testSlpSimple_3) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "  %indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "  %indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+
+  VPInstruction *GetA = cast<VPInstruction>(&*std::next(Body->begin(), 1));
+  VPInstruction *GetB = cast<VPInstruction>(&*std::next(Body->begin(), 3));
+  EXPECT_EQ(GetA, CombinedLoadA->getOperand(0));
+  EXPECT_EQ(GetB, CombinedLoadB->getOperand(0));
+}
+
+TEST_F(VPlanSlpTest, testSlpSimple_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
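+  // Set up VPlan-level interleaved-access info; the SLP analysis queries it
+  // to decide whether two loads/stores access consecutive memory.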
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_1) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 8));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::CombinedStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret i32 %vA1\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 5));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2,
+                                VPBasicBlock *Body,
+                                VPInterleavedAccessInfo &&IAI) {
+  VPSlp Slp(IAI);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+
+  EXPECT_TRUE(Slp.isCompletelySLP());
+  EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::CombinedStore);
+
+  VPInstruction *CombinedAdd =
+      cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add);
+
+  VPInstruction *CombinedMulAB =
+      cast<VPInstruction>(CombinedAdd->getOperand(0));
+  VPInstruction *CombinedMulCD =
+      cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadA =
+      cast<VPInstruction>(CombinedMulAB->getOperand(0));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadA->getOpcode());
+  VPInstruction *LoadvA0 = cast<VPInstruction>(&*std::next(Body->begin(), 2));
+  VPInstruction *LoadvA1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0));
+  EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1));
+
+  VPInstruction *CombinedLoadB =
+      cast<VPInstruction>(CombinedMulAB->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadB->getOpcode());
+  VPInstruction *LoadvB0 = cast<VPInstruction>(&*std::next(Body->begin(), 4));
+  VPInstruction *LoadvB1 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0));
+  EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1));
+
+  EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadC =
+      cast<VPInstruction>(CombinedMulCD->getOperand(0));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadC->getOpcode());
+  VPInstruction *LoadvC0 = cast<VPInstruction>(&*std::next(Body->begin(), 7));
+  VPInstruction *LoadvC1 = cast<VPInstruction>(&*std::next(Body->begin(), 17));
+  EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0));
+  EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1));
+
+  VPInstruction *CombinedLoadD =
+      cast<VPInstruction>(CombinedMulCD->getOperand(1));
+  EXPECT_EQ(VPInstruction::CombinedLoad, CombinedLoadD->getOpcode());
+  VPInstruction *LoadvD0 = cast<VPInstruction>(&*std::next(Body->begin(), 9));
+  VPInstruction *LoadvD1 = cast<VPInstruction>(&*std::next(Body->begin(), 19));
+  EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0));
+  EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_1) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_2) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_3) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA1, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA0\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPSlp Slp(VPIAI);
+  Slp.buildGraph(StoreRoot);
+
+  // FIXME: Need to select a better first value for lane 0.
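+  // The loads of %A are swapped between the two lanes, so with lane 0 fixed
+  // the reordering cannot find consecutive accesses for every bundle.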
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_4) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body:                               ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup:                       ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+} // namespace
+} // namespace llvm