Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -153,6 +153,16 @@
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));

+/// Perform operand reordering across chains of commutative operations (which
+/// we refer to as SuperNodes).
+static cl::opt<bool> EnableSuperNode(
+    "slp-enable-supernode", cl::init(true), cl::Hidden,
+    cl::desc("Enable SuperNodes and operand reordering across them"));
+
+static cl::opt<unsigned> MaxSuperNodeSize(
+    "slp-max-supernode-size", cl::init(8), cl::Hidden,
+    cl::desc("Limit the size of the SuperNode to this many TreeEntries"));
+
 static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden,
                                  cl::desc("Display the SLP trees with Graphviz"));

@@ -493,6 +503,7 @@ class BoUpSLP {
   struct TreeEntry;
   struct ScheduleData;
+  struct BlockScheduling;

 public:
   using ValueList = SmallVector<Value *, 8>;

@@ -506,8 +517,9 @@
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
-      : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
-        DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+      : CurrSuperNode(*DL, *Se, *this), F(Func), SE(Se), TTI(Tti), TLI(TLi),
+        AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE),
+        Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -635,6 +647,14 @@
     TreeEntry *UserTE = nullptr;
     /// The operand index of the use.
     unsigned EdgeIdx = UINT_MAX;
+    /// The APOs across each lane.
+    SmallVector<bool, 4> APOs;
+    /// Initialize the APOs.
+    void initAPOs(unsigned NumLanes) {
+      // Initialize the APOs in 'UserTreeIdx'.
+      APOs.resize(NumLanes);
+      std::fill(APOs.begin(), APOs.end(), false);
+    }
 #ifndef NDEBUG
     friend inline raw_ostream &operator<<(raw_ostream &OS,
                                           const BoUpSLP::EdgeInfo &EI) {
@@ -644,7 +664,10 @@
     /// Debug print.
     void dump(raw_ostream &OS) const {
       OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
-         << " EdgeIdx:" << EdgeIdx << "}";
+         << " EdgeIdx:" << EdgeIdx << " APOs:";
+      for (auto APO : APOs)
+        OS << APO << ",";
+      OS << "}";
     }
     LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
 #endif
@@ -1084,9 +1107,6 @@
       }
     }

-    /// \returns the number of operands.
-    unsigned getNumOperands() const { return OpsVec.size(); }
-
     /// \returns the number of lanes.
     unsigned getNumLanes() const { return OpsVec[0].size(); }

@@ -1098,9 +1118,6 @@
     /// \returns true if the data structure is empty.
     bool empty() const { return OpsVec.empty(); }

-    /// Clears the data.
-    void clear() { OpsVec.clear(); }
-
     /// \Returns true if there are enough operands identical to \p Op to fill
     /// the whole vector.
     /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
@@ -1128,6 +1145,8 @@
     }

   public:
+    VLOperands(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
+        : DL(DL), SE(SE), R(R) {}
     /// Initialize with all the operands of the instruction vector \p RootVL.
     VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
                ScalarEvolution &SE, const BoUpSLP &R)
         : DL(DL), SE(SE), R(R) {
       appendOperandsOfVL(RootVL);
     }
@@ -1136,6 +1155,32 @@
       appendOperandsOfVL(RootVL);
     }

+    /// Append \p OpVL, with the per-lane APOs in \p APOVec, as an extra
+    /// operand vector, one value per lane.
+    void appendOperands(ArrayRef<Value *> OpVL, const EdgeInfo &EI,
+                        ArrayRef<bool> APOVec) {
+      unsigned NumLanes = OpVL.size();
+      assert((OpsVec.empty() || NumLanes == getNumLanes()) &&
+             "Must keep same num of lanes");
+      unsigned SNOpIdx = OpsVec.size();
+      OpsVec.resize(SNOpIdx + 1);
+      OpsVec[SNOpIdx].resize(NumLanes);
+      for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
+        OpsVec[SNOpIdx][Lane] = {OpVL[Lane], APOVec[Lane], false};
+    }
+
+    /// \returns the number of operands.
+    unsigned getNumOperands() const { return OpsVec.size(); }
+
+    /// Clears the data.
+    void clear() { OpsVec.clear(); }
+
+    // Since operand reordering is performed on groups of commutative
+    // operations or alternating sequences (e.g., +, -), we can safely
+    // tell the inverse operations by checking commutativity.
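+    // For example (illustrative, derived from the rule below): with a false
+    // APO at the root of S = A - (B + C), operand A keeps APO = false, while
+    // the inner Add sits at OpIdx 1 of an inverse (non-commutative) user,
+    // the Sub, so B and C both end up with APO = true, i.e. they are
+    // effectively subtracted: S = A - B - C.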
+    static bool computeAPO(unsigned OpIdx, bool UserAPO, bool IsUserInverse) {
+      return (OpIdx == 0) ? UserAPO : (IsUserInverse) ? !UserAPO : UserAPO;
+    }
+
     /// \Returns a value vector with the operands across all lanes for the
     /// operand at \p OpIdx.
     ValueList getVL(unsigned OpIdx) const {
@@ -1147,6 +1192,15 @@
       return OpVL;
     }

+    /// \Returns the APOs across all lanes for \p OpIdx.
+    SmallVector<bool, 4> getAPOVec(unsigned OpIdx) const {
+      unsigned NumLanes = getNumLanes();
+      SmallVector<bool, 4> APOVec(NumLanes);
+      for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
+        APOVec[Lane] = getData(OpIdx, Lane).APO;
+      return APOVec;
+    }
+
     // Performs operand reordering for 2 or more operands.
     // The original operands are in OrigOps[OpIdx][Lane].
     // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
@@ -1308,6 +1362,11 @@
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);

+  /// Recursively build the supernode.
+  void buildSuperNode_rec(ArrayRef<Value *> VL, TreeEntry *TE, unsigned Depth,
+                          const EdgeInfo &UserTreeIdx,
+                          BoUpSLP::BlockScheduling &BS);
+
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
                      const EdgeInfo &EI);

@@ -1410,7 +1469,7 @@
     void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
       if (Operands.size() < OpIdx + 1)
         Operands.resize(OpIdx + 1);
-      assert(Operands[OpIdx].size() == 0 && "Already resized?");
+      Operands[OpIdx].clear();
       Operands[OpIdx].resize(Scalars.size());
       for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
         Operands[OpIdx][Lane] = OpVL[Lane];
     }
@@ -1489,6 +1548,220 @@
 #endif
   };

+  /// This class represents a SuperNode of TreeEntries.
+  /// A supernode is a subgraph of the SLP graph containing chains of nodes
+  /// of legal opcodes. For now, the SuperNode is a tree of Add/Subs.
+  ///
+  /// For example a SuperNode may look like this:
+  /// \verbatim
+  ///          op0 op1
+  ///   +-------\-/-+
+  ///   | TE2      |op2 op3 op4
+  ///   |   \      /------\-/--+
+  ///   |   TE1   TE3          |
+  ///   |     \   /            |
+  ///   |SuperNode TE0         |   TE#: TreeEntry node
+  ///   +----------------------+   op#: Immediate predecessor of node
+  /// \endverbatim
+  ///
+  /// The SuperNode is based on ideas described in:
+  /// Super-Node SLP: Optimized Vectorization for Code Sequences Containing
+  /// Operators and Their Inverse Elements, CGO 2019 by Vasileios Porpodas,
+  /// Rodrigo C. O. Rocha, Evgueni Brevnov, Luís F. W. Góes, Timothy Mattson.
+  class SuperNode {
+    /// The TreeEntries that are part of this SuperNode.
+    SmallSet<TreeEntry *, 8> TreeEntries;
+
+    /// The Root of the SuperNode.
+    TreeEntry *RootTE = nullptr;
+
+    /// All instructions in the SuperNode should have this opcode.
+    unsigned Opcode = 0;
+
+    /// Holds the operands of the SuperNode that will be reordered.
+    VLOperands Operands;
+
+    /// The edges that correspond to the operands of this SuperNode.
+    SmallVector<EdgeInfo, 4> Edges;
+
+    /// All nodes in the SuperNode must have the same number of lanes.
+    unsigned NumLanes = 0;
+
+    /// \Returns the inverse opcode of \p Opc, e.g. the inverse of ADD is SUB.
+    static unsigned getInverseOpcode(unsigned Opc) {
+      switch (Opc) {
+      case Instruction::Add:
+        return Instruction::Sub;
+      case Instruction::Sub:
+        return Instruction::Add;
+      case Instruction::FAdd:
+        return Instruction::FSub;
+      case Instruction::FSub:
+        return Instruction::FAdd;
+      default:
+        return 0;
+      }
+    }
+
+    /// \Returns true if \p VL has compatible opcodes to this SuperNode.
+    bool hasCompatibleOpcodesWithSuperNode(ArrayRef<Value *> VL) const {
+      return std::all_of(VL.begin(), VL.end(), [this](Value *V) {
+        return isa<Instruction>(V) &&
+               (cast<Instruction>(V)->getOpcode() == Opcode ||
+                cast<Instruction>(V)->getOpcode() == getInverseOpcode(Opcode));
+      });
+    }
+
+    /// \returns true if all values in \p VL have a single use.
+    static bool hasSingleUse(ArrayRef<Value *> VL) {
+      return std::all_of(VL.begin(), VL.end(),
+                         [](Value *V) { return V->hasOneUse(); });
+    }
+
+    /// \returns true if the binary operator \p BO allows reassociation.
+    static bool canReassociate(BinaryOperator *BO) {
+      // If a floating-point op, check the fast-math flags.
+      if (auto *FPI = dyn_cast<FPMathOperator>(BO))
+        return FPI->getFastMathFlags().allowReassoc();
+      // Else if an overflowing operator, check for the NSW flag.
+      else if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO))
+        return OBO->hasNoSignedWrap();
+      // Else it is legal to reassociate.
+      return true;
+    }
+
+    /// \returns true if all entries in \p VL allow operand reassociation.
+    static bool allowReassociation(ArrayRef<Value *> VL) {
+      for (Value *V : VL) {
+        assert(isa<BinaryOperator>(V) && "Expected binary operators in VL");
+        BinaryOperator *I = cast<BinaryOperator>(V);
+        if (!canReassociate(I))
+          return false;
+      }
+      return true;
+    }
+
+    /// \returns true if \p VL is compatible with this SuperNode.
+    /// That is, if \p VL:
+    /// (i) has the same (or the inverse) opcode as the rest of the SuperNode,
+    /// (ii) contains only single-use values,
+    /// (iii) has the same number of lanes as the rest of the SuperNode,
+    /// (iv) contains instructions that allow reassociation, and
+    /// (v) is in the same BB as the root of the SuperNode.
+    bool isCompatibleVL(ArrayRef<Value *> VL) const {
+      BasicBlock *BB0 = cast<Instruction>(RootTE->Scalars[0])->getParent();
+      bool IsInSameBB = !isa<Instruction>(VL[0]) ||
+                        cast<Instruction>(VL[0])->getParent() == BB0;
+      return hasCompatibleOpcodesWithSuperNode(VL) && hasSingleUse(VL) &&
+             VL.size() == NumLanes && allowReassociation(VL) && IsInSameBB;
+    }
+
+  public:
+    SuperNode(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R)
+        : Operands(DL, SE, R) {}
+
+    /// \Returns true if \p VL contains legal opcodes for a SuperNode.
+    static bool canInit(ArrayRef<Value *> VL) {
+      assert(isa<Instruction>(VL[0]) && "Expected Instruction");
+      unsigned Opcode0 = cast<Instruction>(VL[0])->getOpcode();
+      if (getInverseOpcode(Opcode0) == 0)
+        return false;
+      return std::all_of(std::next(VL.begin()), VL.end(), [&](Value *V) {
+        assert(isa<Instruction>(V) && "Expected Instruction");
+        unsigned Opcode = cast<Instruction>(V)->getOpcode();
+        return Opcode == Opcode0 || Opcode == getInverseOpcode(Opcode0);
+      });
+    }
+
+    /// Start forming a new SuperNode. Set \p RootTreeEntry as the root.
+    /// \returns true on success.
+    bool init(TreeEntry *RootTreeEntry) {
+      const auto &VL = RootTreeEntry->Scalars;
+      Opcode = cast<Instruction>(VL[0])->getOpcode();
+      RootTE = RootTreeEntry;
+      NumLanes = VL.size();
+      return true;
+    }
+
+    /// \returns true if \p VL can become a SuperNode entry.
+    /// This checks if we have not reached the size limit and if \p VL is
+    /// compatible.
+    bool canExtendTowards(ArrayRef<Value *> VL) const {
+      return size() < MaxSuperNodeSize && isCompatibleVL(VL);
+    }
+
+    /// Append the TreeEntry \p TE, which must be compatible with this
+    /// SuperNode, to the SuperNode.
+    void appendEntry(TreeEntry *TE) {
+      assert((empty() || isCompatibleVL(TE->Scalars)) && "Missing check.");
+      TreeEntries.insert(TE);
+    }
+
+    /// Append \p UserVL to the vector containing the operands of this
+    /// SuperNode.
+    void appendOperands(ArrayRef<Value *> UserVL, const EdgeInfo &EI,
+                        ArrayRef<bool> APOVec) {
+      Operands.appendOperands(UserVL, EI, APOVec);
+      Edges.push_back(EI);
+      assert(Operands.getNumOperands() == Edges.size() && "out of sync");
+    }
+
+    /// \returns true if there are no entries in this SuperNode.
+    bool empty() const { return TreeEntries.empty(); }
+
+    /// \returns the number of TreeEntries in the SuperNode.
+    size_t size() const { return TreeEntries.size(); }
+
+    /// Clears all data.
+    void clear() {
+      TreeEntries.clear();
+      RootTE = nullptr;
+      Opcode = 0;
+      Operands.clear();
+      Edges.clear();
+      NumLanes = 0;
+    }
+
+    /// \returns the root TreeEntry.
+    TreeEntry *getRoot() const { return RootTE; }
+
+    /// \returns the operands of the SuperNode.
+    const VLOperands &getOperands() const { return Operands; }
+
+    /// \returns the number of operands of the SuperNode.
+    unsigned getNumOperands() const { return Operands.getNumOperands(); }
+
+    /// \returns the edge that corresponds to the \p OpIdx 'th operand.
+    const EdgeInfo &getEdge(unsigned OpIdx) const { return Edges[OpIdx]; }
+
+    /// Reorder operands across the whole SuperNode to improve vectorization.
+    void reorderOperands(BoUpSLP *R) {
+      Operands.reorder();
+      // Update the operands of the TreeEntries.
+      for (int Idx = 0, E = Edges.size(); Idx != E; ++Idx)
+        Edges[Idx].UserTE->setOperand(Edges[Idx].EdgeIdx, Operands.getVL(Idx));
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    /// Debug printers.
+    void dump(raw_ostream &OS) const {
+      OS << "TreeEntries: ";
+      for (TreeEntry *TE : TreeEntries)
+        OS << TE->Idx << ", ";
+      OS << "\n";
+
+      for (TreeEntry *TE : TreeEntries)
+        TE->dump();
+      OS << "-------------\n";
+      OS << "Root TE: " << RootTE->Idx << "\n";
+      OS << "Operands:\n";
+      Operands.print(OS);
+    }
+    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
+#endif
+  };
+
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
                           const EdgeInfo &UserTreeIdx,
@@ -1532,6 +1805,9 @@
   /// Holds all of the tree entries.
   TreeEntry::VecTreeTy VectorizableTree;

+  /// Holds the tree entries that are in the SuperNode being constructed.
+  SuperNode CurrSuperNode;
+
 #ifndef NDEBUG
   /// Debug printer.
   LLVM_DUMP_METHOD void dumpVectorizableTree() const {
@@ -1933,6 +2209,9 @@
     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                       const InstructionsState &S);

+    /// Schedule all ready bundles.
+    void scheduleReady();
+
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

@@ -1957,6 +2236,9 @@
     /// Sets all instructions in the scheduling region to un-scheduled.
     void resetSchedule();

+    /// \returns true if \p V is already scheduled.
+    bool alreadyScheduled(Value *V) { return getScheduleData(V) != nullptr; }
+
     BasicBlock *BB;

     /// Simple memory allocation for ScheduleData.
@@ -2252,6 +2534,95 @@
   }
 }

+// Building the CurrSuperNode changes the way we do the recursion.
+// Originally, we used to do a DFS towards the definitions.
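+// That is, we would fully build each operand's subtree before visiting its
+// sibling operand, so reordering decisions stayed local to a single
+// TreeEntry.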
+// When building the SuperNode we change the recursion towards nodes that
+// may be added to the SuperNode. After that, we continue the recursion
+// from the operands of the SuperNode. For example:
+//
+//     L2 L3
+//      \ /
+//  L1  B(+)
+//    \ /
+//    A(+)
+//     |
+//     S             Visiting order
+//                   --------------
+//   Originally:     S, A+, L1, B+, L2, L3
+//   w/SuperNode:    S, A+, B+, L1, L2, L3
+//                      |______|
+//                      SuperNode
+//
+void BoUpSLP::buildSuperNode_rec(ArrayRef<Value *> VL, TreeEntry *TE,
+                                 unsigned Depth, const EdgeInfo &UserTreeIdx,
+                                 BoUpSLP::BlockScheduling &BS) {
+  // If we are building a new supernode, TE is the root entry.
+  if (CurrSuperNode.empty())
+    CurrSuperNode.init(TE);
+  // Add a new entry to the supernode under construction.
+  CurrSuperNode.appendEntry(TE);
+
+  // Reorder the operands of VL.
+  VLOperands Ops(VL, *DL, *SE, *this);
+  Ops.reorder();
+
+  // Update TE to reflect the reordered operands. This is needed for the
+  // scheduler.
+  for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands();
+       OpIdx != NumOperands; ++OpIdx) {
+    const auto &OpVL = Ops.getVL(OpIdx);
+    TE->setOperand(OpIdx, OpVL);
+  }
+  // We are now ready to continue the recursion towards the operands.
+  for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands();
+       OpIdx != NumOperands; ++OpIdx) {
+    const auto &OpVL = Ops.getVL(OpIdx);
+    if (CurrSuperNode.canExtendTowards(OpVL))
+      // If the operands are compatible with the supernode, continue the
+      // recursion towards them. 'OpVL' will be part of the supernode.
+      buildTree_rec(OpVL, Depth + 1, {TE, OpIdx});
+    else {
+      // Else, stop the recursion. These operands will now become
+      // operands of the supernode.
+      CurrSuperNode.appendOperands(OpVL, {TE, OpIdx}, Ops.getAPOVec(OpIdx));
+    }
+  }
+  // We have built a supernode once it is non-empty and we are back at the
+  // root.
+  bool BuiltSuperNode =
+      !CurrSuperNode.empty() && TE->Idx == CurrSuperNode.getRoot()->Idx;
+  if (BuiltSuperNode) {
+    // This is rather ugly, but I don't see a cleaner way.
+    // The problem is that even after calling trySchedule(Bundle),
+    // 'Bundle' is not actually scheduled, only its successors are. This
+    // results in some of the nodes of a supernode (the ones closer to the
+    // root) being scheduled, while others are not. Scheduled instructions
+    // have their predecessors' dependence edges updated. This causes a
+    // problem after reordering, because some of these dependence counters
+    // may be updated twice. With scheduleReady() we force-schedule all
+    // the ready bundles (that is, all of the bundles of the supernode)
+    // before we do the reordering, in order to avoid this double
+    // increment of the dependence counters.
+    // Another alternative is to unschedule and reschedule the nodes.
+    BS.scheduleReady();

+    // Reorder across the whole supernode.
+    CurrSuperNode.reorderOperands(this);
+
+    // Create a temporary copy of the CurrSuperNode for looping
+    // through its operands after it has been cleared.
+    SuperNode TmpSuperNode = CurrSuperNode;
+    // We must clear the CurrSuperNode at this point because the
+    // recursion should continue without any active supernode.
+    CurrSuperNode.clear();
+    // Resume the recursion towards the operands of the supernode.
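+    // Each of these operand bundles may itself become the root of a new
+    // supernode, since CurrSuperNode has just been cleared.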
+    for (unsigned OpIdx = 0, NumOperands = TmpSuperNode.getNumOperands();
+         OpIdx != NumOperands; ++OpIdx) {
+      const auto OpVL = TmpSuperNode.getOperands().getVL(OpIdx);
+      buildTree_rec(OpVL, Depth + 1, TmpSuperNode.getEdge(OpIdx));
+    }
+  }
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -2661,26 +3032,33 @@
       newTreeEntry(VL, true, UserTreeIdx, Bundle, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

-      // Sort operands of the instructions so that each side is more likely to
-      // have the same opcode.
-      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
-        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
-        TE->setOperand(0, Left);
-        TE->setOperand(1, Right);
-        buildTree_rec(Left, Depth + 1, {TE, 0});
-        buildTree_rec(Right, Depth + 1, {TE, 1});
-        return;
-      }
+      if (EnableSuperNode &&
+          // We are either in progress, or we can create a new one.
+          (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL)))
+        buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS);
+      // The default recursion.
+      else {
+        // Sort operands of the instructions so that each side is more likely to
+        // have the same opcode.
+        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+          ValueList Left, Right;
+          reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+          TE->setOperand(0, Left);
+          TE->setOperand(1, Right);
+          buildTree_rec(Left, Depth + 1, {TE, 0});
+          buildTree_rec(Right, Depth + 1, {TE, 1});
+          return;
+        }

-      TE->setOperandsInOrder();
-      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        TE->setOperandsInOrder();
+        for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+          ValueList Operands;
+          // Prepare the operand vector.
+          for (Value *j : VL)
+            Operands.push_back(cast<Instruction>(j)->getOperand(i));

-        buildTree_rec(Operands, Depth + 1, {TE, i});
+          buildTree_rec(Operands, Depth + 1, {TE, i});
+        }
       }
       return;
     }
@@ -2835,31 +3213,40 @@
       BS.cancelScheduling(VL, VL0);
       newTreeEntry(VL, false, UserTreeIdx, nullptr, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+      assert(CurrSuperNode.empty() && "This should be unreachable.");
       return;
     }
     auto *TE = newTreeEntry(VL, true, UserTreeIdx, Bundle,
                             ReuseShuffleIndicies);
     LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

-    // Reorder operands if reordering would enable vectorization.
-    if (isa<BinaryOperator>(VL0)) {
-      ValueList Left, Right;
-      reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
-      TE->setOperand(0, Left);
-      TE->setOperand(1, Right);
-      buildTree_rec(Left, Depth + 1, {TE, 0});
-      buildTree_rec(Right, Depth + 1, {TE, 1});
-      return;
-    }
+    if (EnableSuperNode &&
+        // We are either in progress, or we can create a new one.
+        (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL)))
+      buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS);
+    // The default recursion.
+    else {
+      // Reorder operands if reordering would enable vectorization.
+      if (isa<BinaryOperator>(VL0)) {
+        ValueList Left, Right;
+        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        TE->setOperand(0, Left);
+        TE->setOperand(1, Right);
+        buildTree_rec(Left, Depth + 1, {TE, 0});
+        buildTree_rec(Right, Depth + 1, {TE, 1});
+        return;
+      }

-    TE->setOperandsInOrder();
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (Value *j : VL)
-        Operands.push_back(cast<Instruction>(j)->getOperand(i));
+      TE->setOperandsInOrder();
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (Value *j : VL)
+          Operands.push_back(cast<Instruction>(j)->getOperand(i));

-      buildTree_rec(Operands, Depth + 1, {TE, i});
+        buildTree_rec(Operands, Depth + 1, {TE, i});
+      }
+      return;
     }
     return;
   }
@@ -4567,6 +4954,15 @@
   return {true, Bundle};
 }

+void BoUpSLP::BlockScheduling::scheduleReady() {
+  while (!ReadyInsts.empty()) {
+    ScheduleData *pickedSD = ReadyInsts.back();
+    ReadyInsts.pop_back();
+    if (pickedSD->isSchedulingEntity() && pickedSD->isReady())
+      schedule(pickedSD, ReadyInsts);
+  }
+}
+
 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                 Value *OpValue) {
   if (isa<PHINode>(OpValue))
Index: test/Transforms/SLPVectorizer/X86/lookahead.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -237,28 +237,27 @@
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
-; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/supernode.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -0,0 +1,446 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -verify | FileCheck %s -check-prefix=ENABLED
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -verify -slp-enable-supernode=false | FileCheck %s -check-prefix=DISABLED
+;
+; Without supernode operand reordering, this does not get fully vectorized.
+; S[0] = (A[0] + B[0]) + C[0]
+; S[1] = (B[1] + C[1]) + A[1]
+define void @test_supernode_add(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
+; ENABLED-LABEL: @test_supernode_add(
+; ENABLED-NEXT:  entry:
+; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; ENABLED-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; ENABLED-NEXT:    ret void
+;
+; DISABLED-LABEL: @test_supernode_add(
+; DISABLED-NEXT:  entry:
+; DISABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; DISABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; DISABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; DISABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; DISABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; DISABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
+; DISABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
+; DISABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
+; DISABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
+; DISABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
+; DISABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; DISABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
+; DISABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
+; DISABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; DISABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; DISABLED-NEXT:    ret void
+;
+entry:
+  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
+  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
+  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
+  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
+  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
+  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
+  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
+  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1
+
+  %A0 = load double, double *%idxA0, align 8
+  %A1 = load double, double *%idxA1, align 8
+
+  %B0 = load double, double *%idxB0, align 8
+  %B1 = load double, double *%idxB1, align 8
+
+  %C0 = load double, double *%idxC0, align 8
+  %C1 = load double, double *%idxC1, align 8
+
+  %addA0B0 = fadd fast double %A0, %B0
+  %addB1C1 = fadd fast double %B1, %C1
+  %add0 = fadd fast double %addA0B0, %C0
+  %add1 = fadd fast double %addB1C1, %A1
+  store double %add0, double *%idxS0, align 8
+  store double %add1, double *%idxS1, align 8
+  ret void
+}
+
+
+; Without supernode operand reordering, this does not get fully vectorized.
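+; A and C appear in swapped positions in the two lanes, so reordering the
+; operands of each node in isolation cannot line up the A and C loads: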
+; S[0] = (A[0] - B[0]) + C[0]
+; S[1] = (C[1] - B[1]) + A[1]
+define void @test_supernode_addsub(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
+; ENABLED-LABEL: @test_supernode_addsub(
+; ENABLED-NEXT:  entry:
+; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; ENABLED-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; ENABLED-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; ENABLED-NEXT:    ret void
+;
+; DISABLED-LABEL: @test_supernode_addsub(
+; DISABLED-NEXT:  entry:
+; DISABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; DISABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; DISABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; DISABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; DISABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; DISABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
+; DISABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
+; DISABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
+; DISABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
+; DISABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
+; DISABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; DISABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
+; DISABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
+; DISABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; DISABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; DISABLED-NEXT:    ret void
+;
+entry:
+  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
+  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
+  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
+  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
+  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
+  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
+  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
+  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1
+
+  %A0 = load double, double *%idxA0, align 8
+  %A1 = load double, double *%idxA1, align 8
+
+  %B0 = load double, double *%idxB0, align 8
+  %B1 = load double, double *%idxB1, align 8
+
+  %C0 = load double, double *%idxC0, align 8
+  %C1 = load double, double *%idxC1, align 8
+
+  %subA0B0 = fsub fast double %A0, %B0
+  %subC1B1 = fsub fast double %C1, %B1
+  %add0 = fadd fast double %subA0B0, %C0
+  %add1 = fadd fast double %subC1B1, %A1
+  store double %add0, double *%idxS0, align 8
+  store double %add1, double *%idxS1, align 8
+  ret void
+}
+
+; Without supernode operand reordering, this does not get fully vectorized.
+; This checks that the super-node works with alternate sequences.
+;
+; S[0] = (A[0] - B[0]) - C[0]
+; S[1] = (B[1] + C[1]) + A[1]
+define void @test_supernode_addsub_alt(double* %Aarray, double* %Barray, double *%Carray, double *%Sarray) {
+; ENABLED-LABEL: @test_supernode_addsub_alt(
+; ENABLED-NEXT:  entry:
+; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; ENABLED-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; ENABLED-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
+; ENABLED-NEXT:    ret void
+;
+; DISABLED-LABEL: @test_supernode_addsub_alt(
+; DISABLED-NEXT:  entry:
+; DISABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; DISABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; DISABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; DISABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
+; DISABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; DISABLED-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
+; DISABLED-NEXT:    [[C1:%.*]] = load double, double* [[IDXC1]], align 8
+; DISABLED-NEXT:    [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
+; DISABLED-NEXT:    [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
+; DISABLED-NEXT:    [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
+; DISABLED-NEXT:    [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
+; DISABLED-NEXT:    store double [[SUB0]], double* [[IDXS0]], align 8
+; DISABLED-NEXT:    store double [[ADD1]], double* [[IDXS1]], align 8
+; DISABLED-NEXT:    ret void
+;
+entry:
+  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
+  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
+  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
+  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
+  %idxC0 = getelementptr inbounds double, double* %Carray, i64 0
+  %idxC1 = getelementptr inbounds double, double* %Carray, i64 1
+  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
+  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1
+
+  %A0 = load double, double *%idxA0, align 8
+  %A1 = load double, double *%idxA1, align 8
+
+  %B0 = load double, double *%idxB0, align 8
+  %B1 = load double, double *%idxB1, align 8
+
+  %C0 = load double, double *%idxC0, align 8
+  %C1 = load double, double *%idxC1, align 8
+
+  %subA0B0 = fsub fast double %A0, %B0
+  %addB1C1 = fadd fast double %B1, %C1
+  %sub0 = fsub fast double %subA0B0, %C0
+  %add1 = fadd fast double %addB1C1, %A1
+  store double %sub0, double *%idxS0, align 8
+  store double %add1, double *%idxS1, align 8
+  ret void
+}
+
+; This checks that vectorizeTree() works correctly with the supernode
+; and does not generate uses before defs.
+; If all of the operands of the supernode are vectorizable, then the scheduler
+; will fix their position in the program. If not, then the scheduler may not
+; touch them, leading to uses before defs.
+;
+; A0 = ...
+; C  = ...
+; t1 = A0 + C
+; B0 = ...
+; t2 = t1 + B0
+; A1 = ...
+; B1 = ...
+; t3 = A1 + B1
+; D  = ...
+; t4 = t3 + D
+;
+;
+;  A0 C  A1 B1           A0 C  A1 D             A0:1  C,D
+;   \ /   \ /   Reorder   \ /   \ /   Bundles     \   /
+;  t1 + B0 t3 + D ------> t1 + B0 t3 + B1 ------> t1:3 + B0:1
+;    |/      |/             |/      |/               |/
+;  t2 +    t4 +           t2 +    t4 +            t2:4 +
+;
+; After reordering, 'D' conceptually becomes an operand of t3:
+; t3 = A1 + D
+; But D is defined *after* its use.
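+; Since all the operands of this supernode are vectorizable bundles, the
+; scheduler is able to fix this by moving the definition of D above its new
+; use.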
+;
+define void @supernode_scheduling(double* %Aarray, double* %Barray, double *%Carray, double *%Darray, double *%Sarray) {
+; ENABLED-LABEL: @supernode_scheduling(
+; ENABLED-NEXT:  entry:
+; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXC:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXD:%.*]] = getelementptr inbounds double, double* [[DARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; ENABLED-NEXT:    [[C:%.*]] = load double, double* [[IDXC]], align 8
+; ENABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; ENABLED-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; ENABLED-NEXT:    [[D:%.*]] = load double, double* [[IDXD]], align 8
+; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1
+; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; ENABLED-NEXT:    ret void
+;
+; DISABLED-LABEL: @supernode_scheduling(
+; DISABLED-NEXT:  entry:
+; DISABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXC:%.*]] = getelementptr inbounds double, double* [[CARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXD:%.*]] = getelementptr inbounds double, double* [[DARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; DISABLED-NEXT:    [[C:%.*]] = load double, double* [[IDXC]], align 8
+; DISABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
+; DISABLED-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; DISABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; DISABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; DISABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+; DISABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1
+; DISABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; DISABLED-NEXT:    [[D:%.*]] = load double, double* [[IDXD]], align 8
+; DISABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
+; DISABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1
+; DISABLED-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]]
+; DISABLED-NEXT:    [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; DISABLED-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; DISABLED-NEXT:    ret void
+;
+entry:
+  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
+  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
+  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
+  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
+  %idxC = getelementptr inbounds double, double* %Carray, i64 0
+  %idxD = getelementptr inbounds double, double* %Darray, i64 0
+  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
+  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1
+
+
+  %A0 = load double, double *%idxA0, align 8
+  %C = load double, double *%idxC, align 8
+  %t1 = fadd fast double %A0, %C
+  %B0 = load double, double *%idxB0, align 8
+  %t2 = fadd fast double %t1, %B0
+  %A1 = load double, double *%idxA1, align 8
+  %B1 = load double, double *%idxB1, align 8
+  %t3 = fadd fast double %A1, %B1
+  %D = load double, double *%idxD, align 8
+  %t4 = fadd fast double %t3, %D
+
+  store double %t2, double *%idxS0, align 8
+  store double %t4, double *%idxS1, align 8
+  ret void
+}
+
+
+; The SLP scheduler has trouble moving instructions across blocks.
+; Even though we can build a SuperNode for this example, we should not because the scheduler
+; cannot handle the cross-block instruction motion that is required once the operands of the
+; SuperNode are reordered.
+;
+; bb1:
+;   A0 = ...
+;   B1 = ...
+;   Tmp0 = A0 + 2.0
+;   Tmp1 = B1 + 2.0
+;
+; bb2:
+;   A1 = ...
+;   B0 = ...
+;   S[0] = Tmp0 + B0
+;   S[1] = Tmp1 + A1
+define void @supernode_scheduling_cross_block(double* %Aarray, double* %Barray, double *%Sarray) {
+; ENABLED-LABEL: @supernode_scheduling_cross_block(
+; ENABLED-NEXT:  entry:
+; ENABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; ENABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; ENABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; ENABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; ENABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; ENABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
+; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
+; ENABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], <double 2.000000e+00, double 2.000000e+00>
+; ENABLED-NEXT:    br label [[BB:%.*]]
+; ENABLED:       bb:
+; ENABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; ENABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
+; ENABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
+; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
+; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
+; ENABLED-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; ENABLED-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; ENABLED-NEXT:    ret void
+;
+; DISABLED-LABEL: @supernode_scheduling_cross_block(
+; DISABLED-NEXT:  entry:
+; DISABLED-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[AARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[AARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[BARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[BARRAY]], i64 1
+; DISABLED-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0
+; DISABLED-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1
+; DISABLED-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
+; DISABLED-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
+; DISABLED-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
+; DISABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[B1]], i32 1
+; DISABLED-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[TMP1]], <double 2.000000e+00, double 2.000000e+00>
+; DISABLED-NEXT:    br label [[BB:%.*]]
+; DISABLED:       bb:
+; DISABLED-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
+; DISABLED-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
+; DISABLED-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
+; DISABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[A1]], i32 1
+; DISABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
+; DISABLED-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; DISABLED-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; DISABLED-NEXT:    ret void
+;
+entry:
+  %idxA0 = getelementptr inbounds double, double* %Aarray, i64 0
+  %idxA1 = getelementptr inbounds double, double* %Aarray, i64 1
+  %idxB0 = getelementptr inbounds double, double* %Barray, i64 0
+  %idxB1 = getelementptr inbounds double, double* %Barray, i64 1
+  %idxS0 = getelementptr inbounds double, double* %Sarray, i64 0
+  %idxS1 = getelementptr inbounds double, double* %Sarray, i64 1
+
+  %A0 = load double, double *%idxA0, align 8
+  %B1 = load double, double *%idxB1, align 8
+  %Tmp0 = fadd fast double %A0, 2.0
+  %Tmp1 = fadd fast double %B1, 2.0
+  br label %bb
+
+bb:
+  %A1 = load double, double *%idxA1, align 8
+  %B0 = load double, double *%idxB0, align 8
+
+  %Sum0 = fadd fast double %Tmp0, %B0
+  %Sum1 = fadd fast double %Tmp1, %A1
+
+  store double %Sum0, double *%idxS0, align 8
+  store double %Sum1, double *%idxS1, align 8
+  ret void
+}