diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -167,6 +167,16 @@ cl::desc("The maximum number of users to visit while visiting the " "predecessors. This prevents compilation time increase.")); +/// Perform operand reordering across chains of commutative operations (which we +/// refer to as SuperNodes). +static cl::opt EnableSuperNode( + "slp-enable-supernode", cl::init(true), cl::Hidden, + cl::desc("Enable SuperNodes and operand reordering across them")); + +static cl::opt MaxSuperNodeSize( + "slp-max-supernode-size", cl::init(2), cl::Hidden, + cl::desc("Limit the size of the SuperNode to this many TreeEntries")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -510,6 +520,7 @@ class BoUpSLP { struct TreeEntry; struct ScheduleData; + struct BlockScheduling; public: using ValueList = SmallVector; @@ -523,8 +534,9 @@ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE) - : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), - DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { + : CurrSuperNode(*DL, *Se, *this), F(Func), SE(Se), TTI(Tti), TLI(TLi), + AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), + Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -661,6 +673,14 @@ TreeEntry *UserTE = nullptr; /// The operand index of the use. unsigned EdgeIdx = UINT_MAX; + /// The APOs across each lane. + SmallVector APOs; + /// Initialize the APOs. + void initAPOs(unsigned NumLanes) { + // Initialize the APOs in 'UserTreeIdx'. 
+ APOs.resize(NumLanes); + std::fill(APOs.begin(), APOs.end(), false); + } #ifndef NDEBUG friend inline raw_ostream &operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI) { @@ -670,7 +690,10 @@ /// Debug print. void dump(raw_ostream &OS) const { OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") - << " EdgeIdx:" << EdgeIdx << "}"; + << " EdgeIdx:" << EdgeIdx << " APOs:"; + for (bool APO : APOs) + OS << APO << ","; + OS << "}"; } LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } #endif @@ -1130,9 +1153,6 @@ } } - /// \returns the number of operands. - unsigned getNumOperands() const { return OpsVec.size(); } - /// \returns the number of lanes. unsigned getNumLanes() const { return OpsVec[0].size(); } @@ -1144,9 +1164,6 @@ /// \returns true if the data structure is empty. bool empty() const { return OpsVec.empty(); } - /// Clears the data. - void clear() { OpsVec.clear(); } - /// \Returns true if there are enough operands identical to \p Op to fill /// the whole vector. /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. @@ -1174,6 +1191,9 @@ } public: + VLOperands(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R){}; + /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) @@ -1182,6 +1202,32 @@ appendOperandsOfVL(RootVL); } + /// Append the \p OpIdx 'th operands of \p VL for all lanes. + void appendOperands(ArrayRef OpVL, const EdgeInfo &EI, + ArrayRef APOVec) { + unsigned NumLanes = OpVL.size(); + assert((OpsVec.empty() || NumLanes == getNumLanes()) && + "Must keep same num of lanes"); + unsigned SNOpIdx = OpsVec.size(); + OpsVec.resize(SNOpIdx + 1); + OpsVec[SNOpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) + OpsVec[SNOpIdx][Lane] = {OpVL[Lane], APOVec[Lane], false}; + } + + /// \returns the number of operands. 
+ unsigned getNumOperands() const { return OpsVec.size(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + static bool computeAPO(unsigned OpIdx, bool UserAPO, bool IsUserInverse) { + return (OpIdx == 0) ? UserAPO : (IsUserInverse ? !UserAPO : UserAPO); + } + /// \Returns a value vector with the operands across all lanes for the /// opearnd at \p OpIdx. ValueList getVL(unsigned OpIdx) const { @@ -1193,6 +1239,15 @@ return OpVL; } + /// \Returns the APOs across all lanes for \p OpIdx. + SmallVector getAPOVec(unsigned OpIdx) const { + unsigned NumLanes = getNumLanes(); + SmallVector APOVec(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) + APOVec[Lane] = getData(OpIdx, Lane).APO; + return APOVec; + } + // Performs operand reordering for 2 or more operands. // The original operands are in OrigOps[OpIdx][Lane]. // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. @@ -1362,6 +1417,11 @@ /// \returns the cost of the vectorizable entry. int getEntryCost(TreeEntry *E); + /// Recursively build the supernode. + void buildSuperNode_rec(ArrayRef VL, TreeEntry *TE, unsigned Depth, + const EdgeInfo &UserTreeIdx, + BoUpSLP::BlockScheduling &BS); + /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef Roots, unsigned Depth, const EdgeInfo &EI); @@ -1467,7 +1527,7 @@ void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); - assert(Operands[OpIdx].size() == 0 && "Already resized?"); + Operands[OpIdx].clear(); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) Operands[OpIdx][Lane] = OpVL[Lane]; @@ -1606,6 +1666,220 @@ #endif }; + /// This class represents a SuperNode of TreeEntries. 
+ /// A supernode is a subgraph of the SLP graph containing chains of nodes
+ /// of legal opcodes. For now, the SuperNode is a tree of Add/Subs.
+ ///
+ /// For example a SuperNode may look like this:
+ /// \verbatim
+ /// op0 op1
+ /// +-------\-/-+
+ /// | TE2 |op2 op3 op4
+ /// | \ /------\-/--+
+ /// | TE1 TE3 |
+ /// | \ / |
+ /// |SuperNode TE0 | TE#: TreeEntry node
+ /// +-----------------------+ op#: Immediate predecessor of node
+ /// \endverbatim
+ ///
+ /// The SuperNode is based on ideas described in:
+ /// Super-Node SLP: Optimized Vectorization for Code Sequences Containing
+ /// Operators and Their Inverse Elements, CGO 2019 by Vasileios Porpodas,
+ /// Rodrigo C. O. Rocha, Evgueni Brevnov, Luís F. W. Góes, Timothy Mattson.
+ class SuperNode {
+ /// The TreeEntries that are part of this SuperNode.
+ SmallSet TreeEntries;
+
+ /// The Root of the SuperNode.
+ TreeEntry *RootTE = nullptr;
+
+ /// All instructions in the SuperNode should have this opcode.
+ unsigned Opcode = 0;
+
+ /// Holds the operands of the SuperNode that will be reordered.
+ VLOperands Operands;
+
+ /// The edges that correspond to the operands of this SuperNode.
+ SmallVector Edges;
+
+ /// All nodes in the SuperNode must have the same number of lanes.
+ unsigned NumLanes = 0;
+
+ /// \Returns the inverse opcode of \p Opc, e.g. the inverse of ADD is SUB.
+ static unsigned getInverseOpcode(unsigned Opc) {
+ switch (Opc) {
+ case Instruction::Add:
+ return Instruction::Sub;
+ case Instruction::Sub:
+ return Instruction::Add;
+ case Instruction::FAdd:
+ return Instruction::FSub;
+ case Instruction::FSub:
+ return Instruction::FAdd;
+ default:
+ return 0;
+ }
+ }
+
+ /// \Returns true if \p VL has compatible opcodes to this SuperNode. 
+ bool hasCompatibleOpcodesWithSuperNode(ArrayRef VL) const { + return llvm::all_of(VL, [this](Value *V) { + return isa(V) && + (cast(V)->getOpcode() == Opcode || + cast(V)->getOpcode() == getInverseOpcode(Opcode)); + }); + } + + /// \returns true if all scalars in TreeEntry \p TEIdx have a single use. + static bool hasSingleUse(ArrayRef VL) { + return std::all_of(VL.begin(), VL.end(), + [](Value *V) { return V->hasOneUse(); }); + } + + /// \returns true if the binary operator \p BO allows reassociation. + static bool canReassociate(BinaryOperator *BO) { + // If fmath, then check the fast-math flags. + if (auto FPI = dyn_cast(BO)) + return FPI->getFastMathFlags().allowReassoc(); + // Else if an overflowing operator, check for the NSW flag. + else if (auto OBO = dyn_cast(BO)) + return OBO->hasNoSignedWrap(); + // Else it is legal to reassociate. + return true; + } + + /// \returns true if all entries in \p VL allow operand reassociation. + static bool allowReassociation(ArrayRef VL) { + for (Value *V : VL) { + assert(isa(V) && "Expected binary operators in VL"); + BinaryOperator *I = cast(V); + if (!canReassociate(I)) + return false; + } + return true; + } + + /// \returns true if \p VL is compatible with this SuperNode. + /// That is if \p VL : + /// (i) has the same opcode as the rest of the SuperNode, + /// (ii) has a single use, + /// (iii) has the same number of lanes as the rest of SuperNode, and + /// (iv) contains instructions that allow reassociation. + /// (v) is in the same BB as the root of the SuperNode. + bool isCompatibleVL(ArrayRef VL) const { + BasicBlock *BB0 = cast(RootTE->Scalars[0])->getParent(); + bool IsInSameBB = (!isa(VL[0])) + ? 
true + : cast(VL[0])->getParent() == BB0; + return hasCompatibleOpcodesWithSuperNode(VL) && hasSingleUse(VL) && + VL.size() == NumLanes && allowReassociation(VL) && IsInSameBB; + } + + public: + SuperNode(const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : Operands(DL, SE, R) {} + + /// \Returns true if \p VL contains legal opcodes for a SuperNode. + static bool canInit(ArrayRef VL) { + assert(isa(VL[0]) && "Expected Instruction"); + unsigned Opcode0 = cast(VL[0])->getOpcode(); + if (getInverseOpcode(Opcode0) == 0) + return false; + return std::all_of(std::next(VL.begin()), VL.end(), [&](Value *V) { + assert(isa(V) && "Expected Instruction"); + unsigned Opcode = cast(V)->getOpcode(); + return Opcode == Opcode0 || Opcode == getInverseOpcode(Opcode0); + }); + } + + /// Start forming a new SuperNode. Set \p RootTreeEntry as the root. + /// \returns true on success. + bool init(TreeEntry *RootTreeEntry) { + const ValueList &VL = RootTreeEntry->Scalars; + Opcode = cast(VL[0])->getOpcode(); + RootTE = RootTreeEntry; + NumLanes = VL.size(); + return true; + } + + /// \returns true if \p VL can become a SuperNode entry. + /// This checks if we have not reached the size limit and if the \p VL is + /// compatible. + bool canExtendTowards(ArrayRef VL) const { + return size() < MaxSuperNodeSize && isCompatibleVL(VL); + } + + /// Append the TreeEntry \p TE, which must be compatible with this SuperNode. + void appendEntry(TreeEntry *TE) { + assert((empty() || isCompatibleVL(TE->Scalars)) && "Missing check."); + TreeEntries.insert(TE); + } + + /// Append \p OpVL to the vector containing the operands of this SuperNode. + void appendOperands(ArrayRef UserVL, const EdgeInfo &EI, + ArrayRef APOVec) { + Operands.appendOperands(UserVL, EI, APOVec); + Edges.push_back(EI); + assert(Operands.getNumOperands() == Edges.size() && "out of sync"); + } + + /// \returns true if there are no entries in this SuperNode. 
+ bool empty() const { return TreeEntries.empty(); } + + /// \returns the number of TreeEntries in the SuperNode. + size_t size() const { return TreeEntries.size(); } + + /// Clears all data. + void clear() { + TreeEntries.clear(); + RootTE = nullptr; + Opcode = 0; + Operands.clear(); + Edges.clear(); + NumLanes = 0; + } + + /// \returns the index of the root TreeEntry. + TreeEntry *getRoot() const { + return RootTE; + } + + /// \returns the operands of the SuperNode. + const VLOperands &getOperands() const { return Operands; } + + /// \returns the number of operands of the SuperNode. + unsigned getNumOperands() const { return Operands.getNumOperands(); } + + /// \returns the edge that corresponds to \p OpIdx 'th operand. + const EdgeInfo &getEdge(unsigned OpIdx) const { return Edges[OpIdx]; } + + /// Reorder operands across the whole SuperNode to improve vectorization. + void reorderOperands(BoUpSLP *R) { + Operands.reorder(); + // We update the operands of the TreeEntries. + for (int Idx = 0, E = Edges.size(); Idx != E; ++Idx) + Edges[Idx].UserTE->setOperand(Edges[Idx].EdgeIdx, Operands.getVL(Idx)); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Debug printers. + void dump(raw_ostream &OS) const { + OS << "TreeEntries: "; + for (TreeEntry *TE : TreeEntries) + OS << TE->Idx << ", "; + OS << "\n"; + + for (TreeEntry *TE : TreeEntries) + TE->dump(); + OS << "-------------\n"; + OS << "Root TE: " << RootTE->Idx << "\n"; + OS << "Operands:\n"; + Operands.print(OS); + } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } +#endif + }; + /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, Optional Bundle, const InstructionsState &S, @@ -1651,6 +1925,9 @@ /// Holds all of the tree entries. TreeEntry::VecTreeTy VectorizableTree; + /// Holds the tree entries that are in the SuperNode being constructed. + SuperNode CurrSuperNode; + #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dumpVectorizableTree() const { @@ -2051,6 +2328,9 @@ tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S); + /// Schedule all ready bundles. + void scheduleReady(); + /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL, Value *OpValue); @@ -2075,6 +2355,9 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// \returns true if \p VL is already scheduled. + bool alreadyScheduled(Value *V) { return getScheduleData(V); } + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -2394,6 +2677,95 @@ } } +// Building the CurrSuperNode changes the way we do the recursion. +// Originally, we used to do a DFS towards the definitions. +// When building the SuperNode we change the recursion towards nodes that +// may be added to the SuperNode. After that, we continue the recursion +// from the operands of the SuperNode. For example: +// +// L2 L3 +// \ / +// L1 B(+) +// \ / +// A(+) +// | +// S Visiting order +// -------------- +// Originally: S, A+, L1, B+, L2, L3 +// w/SuperNode: S, A+, B+, L1, L2, L3 +// |______| +// SuperNode +// +void BoUpSLP::buildSuperNode_rec(ArrayRef VL, TreeEntry *TE, + unsigned Depth, const EdgeInfo &UserTreeIdx, + BoUpSLP::BlockScheduling &BS) { + // If we are building a new supernode, TE is the root entry. + if (CurrSuperNode.empty()) + CurrSuperNode.init(TE); + // Add a new entry to the supernode under construction. + CurrSuperNode.appendEntry(TE); + + // Reorder the operands of VL. + VLOperands Ops(VL, *DL, *SE, *this); + Ops.reorder(); + + // Update TE to reflect the reordered operands. This is needed for the + // scheduler. + for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + const ValueList &OpVL = Ops.getVL(OpIdx); + TE->setOperand(OpIdx, OpVL); + } + // We are now ready to continue the recursion towards the operands. 
+ for (unsigned OpIdx = 0, NumOperands = Ops.getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ const ValueList &OpVL = Ops.getVL(OpIdx);
+ if (CurrSuperNode.canExtendTowards(OpVL))
+ // If the operands are compatible with the supernode, continue the
+ // recursion towards them. 'OpVL' will be part of the supernode.
+ buildTree_rec(OpVL, Depth + 1, {TE, OpIdx});
+ else {
+ // Else, stop the recursion. These operands will now become
+ // operands of the supernode.
+ CurrSuperNode.appendOperands(OpVL, {TE, OpIdx}, Ops.getAPOVec(OpIdx));
+ }
+ }
+ // We have built the supernode if it is non-empty and we are back at the root.
+ bool BuiltSuperNode =
+ !CurrSuperNode.empty() && TE->Idx == CurrSuperNode.getRoot()->Idx;
+ if (BuiltSuperNode) {
+ // This is rather ugly, but I don't see a cleaner way.
+ // The problem is that even after calling trySchedule(Bundle),
+ // 'Bundle' is not actually scheduled, only its successors are. This
+ // results in some of the nodes of a supernode (the ones closer to the
+ // root) being scheduled, while others not. Scheduled instructions
+ // have their predecessors dependence edges updated. This causes a
+ // problem after reordering, because some of these dependence counters
+ // may be updated twice. With scheduleReady() we force-schedule all
+ // the ready bundles (that is all of the bundles of the supernode)
+ // before we do the reordering, in order to avoid this double
+ // increment of the dependence counters.
+ // Another alternative is to unschedule and reschedule the nodes.
+ BS.scheduleReady();
+
+ // Reordering across the whole supernode.
+ CurrSuperNode.reorderOperands(this);
+
+ // Create a temporary copy of the CurrSuperNode for looping
+ // through its operands after it has been cleared.
+ SuperNode TmpSuperNode = CurrSuperNode;
+ // We must clear the CurrSuperNode at this point because the
+ // recursion should continue without any active supernode. 
+ CurrSuperNode.clear(); + // Resume the recursion towards the operands of the supernode. + for (unsigned OpIdx = 0, NumOperands = TmpSuperNode.getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + const ValueList OpVL = TmpSuperNode.getOperands().getVL(OpIdx); + buildTree_rec(OpVL, Depth + 1, TmpSuperNode.getEdge(OpIdx)); + } + } +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -2811,26 +3183,33 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); - // Sort operands of the instructions so that each side is more likely to - // have the same opcode. - if (isa(VL0) && VL0->isCommutative()) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - return; - } + if (EnableSuperNode && + // We are either in progress, or we can create a new one. + (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL))) + buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS); + // The default recursion. + else { + // Sort operands of the instructions so that each side is more likely to + // have the same opcode. + if (isa(VL0) && VL0->isCommutative()) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); + return; + } - TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + TE->setOperandsInOrder(); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (Value *j : VL) + Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, i}); + } } return; } @@ -3054,25 +3433,33 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); - // Reorder operands if reordering would enable vectorization. - if (isa(VL0)) { - ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); - TE->setOperand(0, Left); - TE->setOperand(1, Right); - buildTree_rec(Left, Depth + 1, {TE, 0}); - buildTree_rec(Right, Depth + 1, {TE, 1}); - return; - } + if (EnableSuperNode && + // We are either in progress, or we can create a new one. + (!CurrSuperNode.empty() || CurrSuperNode.canInit(VL))) + buildSuperNode_rec(VL, TE, Depth, UserTreeIdx, BS); + // The default recursion. + else { + // Reorder operands if reordering would enable vectorization. + if (isa(VL0)) { + ValueList Left, Right; + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); + return; + } - TE->setOperandsInOrder(); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + TE->setOperandsInOrder(); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. 
+ for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, {TE, i}); + buildTree_rec(Operands, Depth + 1, {TE, i}); + } + return; } return; } @@ -4879,6 +5266,14 @@ return Bundle; } +void BoUpSLP::BlockScheduling::scheduleReady() { + while (!ReadyInsts.empty()) { + ScheduleData *pickedSD = ReadyInsts.pop_back_val(); + if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) + schedule(pickedSD, ReadyInsts); + } +} + void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { if (isa(OpValue)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -15,18 +15,14 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> 
[[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void @@ -74,18 +70,14 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1 -; ENABLED-NEXT: 
[[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP5]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void @@ -134,18 +126,20 @@ ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, double* [[CARRAY]], i64 1 ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, double* [[IDXC1]], align 8 -; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] -; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]] -; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]] -; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]] -; ENABLED-NEXT: store double [[SUB0]], double* [[IDXS0]], align 8 -; ENABLED-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; ENABLED-NEXT: 
[[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXC0]] to <2 x double>* +; ENABLED-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; ENABLED-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; ENABLED-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP5]] +; ENABLED-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP5]] +; ENABLED-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x i32> +; ENABLED-NEXT: [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; ENABLED-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; ENABLED-NEXT: ret void ; entry: @@ -216,17 +210,15 @@ ; ENABLED-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[SARRAY:%.*]], i64 0 ; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[SARRAY]], i64 1 ; ENABLED-NEXT: [[C:%.*]] = load double, double* [[IDXC]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; ENABLED-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0 -; ENABLED-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B1]], i32 1 -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP2:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; ENABLED-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; ENABLED-NEXT: [[D:%.*]] = load 
double, double* [[IDXD]], align 8 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0 +; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP5]] +; ENABLED-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP3]] ; ENABLED-NEXT: [[TMP8:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; ENABLED-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; ENABLED-NEXT: ret void