Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -634,6 +634,420 @@ #endif }; + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. 
+ bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, // Matching loads to consecutive memory addresses + Opcode, // Matching instructions based on opcode (same or alternate) + Constant, // Matching constants + Splat, // Matching the same instruction multiple times (broadcast) + Failed, // We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector; + + /// A vector of operand vectors. + SmallVector OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + // Search all operands in Ops[*][Lane] for the one that matches best + // Ops[OpIdx][LastLane] and return its operand index. + // If no good match can be found, return None. + Optional + getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef ReorderingModes) { + unsigned NumOperands = getNumOperands(); + + // The operand of the previous lane at OpIdx. + Value *OpLastLane = getData(OpIdx, LastLane).V; + + // Our strategy mode for OpIdx. + ReorderingMode RMode = ReorderingModes[OpIdx]; + + // The linearized opcode of the operand at OpIdx, Lane. 
+ bool OpIdxAPO = getData(OpIdx, Lane).APO; + + const unsigned BestScore = 2; + const unsigned GoodScore = 1; + + // The best operand index and its score. + // Sometimes we have more than one option (e.g., Opcode and Undefs), so we + // are using the score to differentiate between the two. + struct BestOpData { + Optional Idx = None; + unsigned Score = 0; + } BestOp; + + // Iterate through all unused operands and look for the best. + for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { + // Get the operand at Idx and Lane. + OperandData &OpData = getData(Idx, Lane); + Value *Op = OpData.V; + bool OpAPO = OpData.APO; + + // Skip already selected operands. + if (OpData.IsUsed) + continue; + + // Skip if we are trying to move the operand to a position with a + // different opcode in the linearized tree form. This would break the + // semantics. + if (OpAPO != OpIdxAPO) + continue; + + // Look for an operand that matches the current mode. + switch (RMode) { + case ReorderingMode::Load: + if (isa(Op)) { + // Figure out which is left and right, so that we can check for + // consecutive loads + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + if (isConsecutiveAccess(cast(OpLeft), + cast(OpRight), DL, SE)) + BestOp.Idx = Idx; + } + break; + case ReorderingMode::Opcode: + // We accept both Instructions and Undefs, but with different scores. + if ((isa(Op) && + cast(Op)->getOpcode() == + cast(OpLastLane)->getOpcode()) || + isa(Op)) { + // An instruction has a higher score than an undef. + unsigned Score = (isa(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Constant: + if (isa(Op)) { + unsigned Score = (isa(Op)) ? 
GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Splat: + if (Op == OpLastLane) + BestOp.Idx = Idx; + break; + case ReorderingMode::Failed: + return None; + } + } + + if (BestOp.Idx) { + getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + return BestOp.Idx; + } + // If we could not find a good match return None. + return None; + } + + /// Helper for reorderOperandVecs. \Returns the lane that we should start + /// reordering from. This is the one which has the least number of operands + /// that can freely move about. + unsigned getBestLaneToStartReordering() const { + unsigned BestLane = 0; + unsigned Min = UINT_MAX; + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) { + unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); + if (NumFreeOps < Min) { + Min = NumFreeOps; + BestLane = Lane; + } + } + return BestLane; + } + + /// \Returns the maximum number of operands that are allowed to be reordered + /// for \p Lane. This is used as a heuristic for selecting the first lane to + /// start operand reordering. + unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + unsigned CntTrue = 0; + unsigned NumOperands = getNumOperands(); + // Operands with the same APO can be reordered. We therefore need to count + // how many of them we have for each APO, like this: Cnt[APO] = x. + // Since we only have two APOs, namely true and false, we can avoid using + // a map. Instead we can simply count the number of operands that + // correspond to one of them (in this case the 'true' APO), and calculate + // the other by subtracting it from the total number of operands. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) + if (getData(OpIdx, Lane).APO) + CntTrue++; + unsigned CntFalse = NumOperands - CntTrue; + return std::max(CntTrue, CntFalse); + } + + /// Go through the instructions in VL and append their operands. 
+ void appendOperandsOfVL(ArrayRef VL) { + assert(!VL.empty() && "Bad VL"); + assert((empty() || VL.size() == getNumLanes()) && + "Expected same number of lanes"); + assert(isa(VL[0]) && "Expected instruction"); + unsigned NumOperands = cast(VL[0])->getNumOperands(); + OpsVec.resize(NumOperands); + unsigned NumLanes = VL.size(); + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + OpsVec[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + assert(isa(VL[Lane]) && "Expected instruction"); + // Our tree has just 3 nodes: the root and two operands. + // It is therefore trivial to get the APO. We only need to check the + // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or + // RHS operand. The LHS operand of both add and sub is never attached + // to an inverse operation in the linearized form, therefore its APO + // is false. The RHS is true only if VL[Lane] is an inverse operation. + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + bool IsInverseOperation = !isCommutative(cast(VL[Lane])); + bool APO = (OpIdx == 0) ? false : IsInverseOperation; + OpsVec[OpIdx][Lane] = {cast(VL[Lane])->getOperand(OpIdx), + APO, false}; + } + } + } + + /// \returns the number of operands. + unsigned getNumOperands() const { return OpsVec.size(); } + + /// \returns the number of lanes. + unsigned getNumLanes() const { return OpsVec[0].size(); } + + /// \returns the operand value at \p OpIdx and \p Lane. + Value *getValue(unsigned OpIdx, unsigned Lane) const { + return getData(OpIdx, Lane).V; + } + + /// \returns true if the data structure is empty. + bool empty() const { return OpsVec.empty(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + public: + VLOperands() = default; + /// Initialize with all the operands of the instruction vector \p VL. 
+ VLOperands(ArrayRef VL, const DataLayout &DL, ScalarEvolution &SE) + : DL(DL), SE(SE) { + // Append all the operands of VL. + appendOperandsOfVL(VL); + } + + /// \Returns a value vector with the operands across all lanes for the + /// operand at \p OpIdx. + ValueList getVL(unsigned OpIdx) const { + ValueList OpVL; + OpVL.reserve(OpsVec[OpIdx].size()); + for (const OperandData &OpData : OpsVec[OpIdx]) + OpVL.push_back(OpData.V); + return OpVL; + } + + // Performs operand reordering for 2 or more operands. + // The operands are held in 'OpsVec[OpIdx][Lane]'. + // They are reordered in place, by swapping entries within each lane. + void reorder() { + unsigned NumOperands = getNumOperands(); + unsigned NumLanes = getNumLanes(); + // Each operand has its own mode. We are using this mode to help us select + // the instructions for each lane, so that they match best with the ones + // we have selected so far. + SmallVector ReorderingModes(NumOperands); + + // This is a greedy single-pass algorithm. We are going over each lane + // once and deciding on the best order right away with no back-tracking. + // However, in order to increase its effectiveness, we start with the lane + // that has operands that can move the least. For example, given the + // following lanes: + // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd + // Lane 1 : A[1] = C[1] - B[1] // Visited 1st + // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd + // Lane 3 : A[3] = C[3] - B[3] // Visited 4th + // we will start at Lane 1, since the operands of the subtraction cannot + // be reordered. Then we will visit the rest of the lanes in a circular + // fashion. That is, Lane 2, then Lane 0, and finally Lane 3. + + // Find the first lane that we will start our search from. + unsigned FirstLane = getBestLaneToStartReordering(); + + // Initialize the modes. 
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + Value *OpLane0 = getValue(OpIdx, FirstLane); + // Keep track if we have instructions with all the same opcode on one + // side. + if (isa(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Load; + else if (isa(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Opcode; + else if (isa(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Constant; + else if (isa(OpLane0)) + // Our best hope is a Splat. It may save some cost in some cases. + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + llvm_unreachable("Bad Operand"); + } + + // We keep the original operand order for the FirstLane, so reorder the + // rest of the lanes. We are visiting the nodes in a circular fashion, + // using FirstLane as the center point and increasing the radius distance. + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { + // Visit the lane on the right and then the lane on the left. + for (int Direction : {+1, -1}) { + int Lane = FirstLane + Direction * Distance; + if (Lane < 0 || Lane >= (int)NumLanes) + continue; + int LastLane = Lane - Direction; + assert(LastLane >= 0 && LastLane < (int)NumLanes && "Out of bounds"); + // Look for a good match for each operand. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + // Search for the operand that matches SortedOps[OpIdx][Lane-1]. + Optional BestIdx = + getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + // By not selecting a value, we allow the operands that follow to + // select a better matching value. We will get a non-null value in + // the next run of getBestOperand(). + if (BestIdx) + // Swap the current operand with the one returned by + // getBestOperand(). + swap(OpIdx, BestIdx.getValue(), Lane); + else + // If we failed to find a best operand, then set the mode to + // 'Failed'. 
+ ReorderingModes[OpIdx] = ReorderingMode::Failed; + } + } + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD static const char *getModeStr(ReorderingMode RMode) { + switch (RMode) { + case ReorderingMode::Load: + return "Load"; + case ReorderingMode::Opcode: + return "Opcode"; + case ReorderingMode::Constant: + return "Constant"; + case ReorderingMode::Splat: + return "Splat"; + case ReorderingMode::Failed: + return "Failed"; + } + llvm_unreachable("Unimplemented Reordering Type"); + } + + LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, + raw_ostream &OS) { + return OS << getModeStr(RMode); + } + + /// Debug print. + LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { + printMode(RMode, dbgs()); + } + + friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { + return printMode(RMode, OS); + } + + LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { + const unsigned Indent = 2; + unsigned Cnt = 0; + for (const auto &OpDataVec : OpsVec) { + OS << "Operand " << Cnt++ << "\n"; + for (const auto &OpData : OpDataVec) { + OS.indent(Indent) << "{"; + if (Value *V = OpData.V) + OS << *V; + else + OS << "null"; + OS << ", APO:" << OpData.APO << "}\n"; + } + OS << "\n"; + } + return OS; + } + + /// Debug print. + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif + }; + private: struct TreeEntry; @@ -681,10 +1095,12 @@ /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree() const; - /// \reorder commutative operands to get better probability of + /// Reorder the operands in \p Ops to improve vectorization. + void reorderOperandVecs(VLOperands &Ops) const; + + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(const InstructionsState &S, - ArrayRef VL, + void reorderInputsAccordingToOpcode(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) const; struct TreeEntry { @@ -1866,7 +2282,7 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(S, VL, Left, Right); + reorderInputsAccordingToOpcode(VL, Left, Right); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -1912,7 +2328,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S, VL, Left, Right); + reorderInputsAccordingToOpcode(VL, Left, Right); UserTreeIdx.EdgeIdx = 0; buildTree_rec(Left, Depth + 1, UserTreeIdx); UserTreeIdx.EdgeIdx = 1; @@ -2087,7 +2503,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S, VL, Left, Right); + reorderInputsAccordingToOpcode(VL, Left, Right); UserTreeIdx.EdgeIdx = 0; buildTree_rec(Left, Depth + 1, UserTreeIdx); UserTreeIdx.EdgeIdx = 1; @@ -2802,171 +3218,17 @@ return getGatherCost(VecTy, ShuffledElements); } -// Return true if the i'th left and right operands can be commuted. -// -// The vectorizer is trying to either have all elements one side being -// instruction with the same opcode to enable further vectorization, or having -// a splat to lower the vectorizing cost. -static bool shouldReorderOperands(int i, ArrayRef Left, - ArrayRef Right, - bool AllSameOpcodeLeft, - bool AllSameOpcodeRight, bool SplatLeft, - bool SplatRight) { - Value *PrevLeft = Left[i - 1]; - Value *PrevRight = Right[i - 1]; - Value *CurrLeft = Left[i]; - Value *CurrRight = Right[i]; - - // If we have "SplatRight", try to see if commuting is needed to preserve it. 
- if (SplatRight) { - if (CurrRight == PrevRight) - // Preserve SplatRight - return false; - if (CurrLeft == PrevRight) { - // Commuting would preserve SplatRight, but we don't want to break - // SplatLeft either, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (SplatLeft && CurrLeft == PrevLeft) - return false; - return true; - } - } - // Symmetrically handle Right side. - if (SplatLeft) { - if (CurrLeft == PrevLeft) - // Preserve SplatLeft - return false; - if (CurrRight == PrevLeft) - return true; - } - - Instruction *ILeft = dyn_cast(CurrLeft); - Instruction *IRight = dyn_cast(CurrRight); - - // If we have "AllSameOpcodeRight", try to see if the left operands preserves - // it and not the right, in this case we want to commute. - if (AllSameOpcodeRight) { - unsigned RightPrevOpcode = cast(PrevRight)->getOpcode(); - if (IRight && RightPrevOpcode == IRight->getOpcode()) - // Do not commute, a match on the right preserves AllSameOpcodeRight - return false; - if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { - // We have a match and may want to commute, but first check if there is - // not also a match on the existing operands on the Left to preserve - // AllSameOpcodeLeft, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (AllSameOpcodeLeft && ILeft && - cast(PrevLeft)->getOpcode() == ILeft->getOpcode()) - return false; - return true; - } - } - // Symmetrically handle Left side. 
- if (AllSameOpcodeLeft) { - unsigned LeftPrevOpcode = cast(PrevLeft)->getOpcode(); - if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) - return false; - if (IRight && LeftPrevOpcode == IRight->getOpcode()) - return true; - } - return false; -} - -void BoUpSLP::reorderInputsAccordingToOpcode( - const InstructionsState &S, ArrayRef VL, - SmallVectorImpl &Left, SmallVectorImpl &Right) const { - assert(!VL.empty() && Left.empty() && Right.empty() && - "Unexpected instruction/operand lists"); - - // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } - - // Keep track if we have instructions with all the same opcode on one side. - bool AllSameOpcodeLeft = isa(Left[0]); - bool AllSameOpcodeRight = isa(Right[0]); - // Keep track if we have one side with all the same value (broadcast). - bool SplatLeft = true; - bool SplatRight = true; - - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); - // Commute to favor either a splat or maximizing having the same opcodes on - // one side. - if (isCommutative(I) && - shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight)) - std::swap(Left[i], Right[i]); - - // Update Splat* and AllSameOpcode* after the insertion. - SplatRight = SplatRight && (Right[i - 1] == Right[i]); - SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); - AllSameOpcodeLeft = AllSameOpcodeLeft && isa(Left[i]) && - (cast(Left[i - 1])->getOpcode() == - cast(Left[i])->getOpcode()); - AllSameOpcodeRight = AllSameOpcodeRight && isa(Right[i]) && - (cast(Right[i - 1])->getOpcode() == - cast(Right[i])->getOpcode()); - } - - // If one operand end up being broadcast, return this operand order. - if (SplatRight || SplatLeft) +// Perform operand reordering on the instructions in VL. 
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right) const { + if (VL.empty()) return; - - // Finally check if we can get longer vectorizable chain by reordering - // without breaking the good operand order detected above. - // E.g. If we have something like- - // load a[0] - load b[0] - // load b[1] + load a[1] - // load a[2] - load b[2] - // load a[3] + load b[3] - // Reordering the second load b[1] + load a[1] would allow us to vectorize - // this code and we still retain AllSameOpcode property. - // FIXME: This load reordering might break AllSameOpcode in some rare cases - // such as- - // add a[0],c[0] load b[0] - // add a[1],c[2] load b[1] - // b[2] load b[2] - // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { - if (LoadInst *L = dyn_cast(Left[j])) { - if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - auto *VL1 = cast(VL[j]); - auto *VL2 = cast(VL[j + 1]); - if (isCommutative(VL2)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - if (isCommutative(VL1)) { - std::swap(Left[j], Right[j]); - continue; - } - } - } - } - if (LoadInst *L = dyn_cast(Right[j])) { - if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - auto *VL1 = cast(VL[j]); - auto *VL2 = cast(VL[j + 1]); - if (isCommutative(VL2)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - if (isCommutative(VL1)) { - std::swap(Left[j], Right[j]); - continue; - } - } - } - } - // else unchanged - } + VLOperands Ops(VL, *DL, *SE); + // Reorder the operands in place. 
+ Ops.reorder(); + Left = Ops.getVL(0); + Right = Ops.getVL(1); } void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, Index: test/Transforms/SLPVectorizer/X86/alternate-int.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -536,13 +536,11 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) { ; CHECK-LABEL: @add_sub_v8i32_splat( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]] +; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -126,14 +126,15 @@ define fastcc void @dct36(double* %inbuf) { ; CHECK-LABEL: @dct36( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2 -; CHECK-NEXT: 
[[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8 -; CHECK-NEXT: [[ADD46:%.*]] = fadd double [[TMP0]], undef -; CHECK-NEXT: store double [[ADD46]], double* [[ARRAYIDX41]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[INBUF]], align 8 -; CHECK-NEXT: [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]] -; CHECK-NEXT: store double [[ADD49]], double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[INBUF]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/operandorder.ll +++ test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -28,19 +28,16 @@ ret void } -define void @shuffle_preserve_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) { -; CHECK-LABEL: @shuffle_preserve_broadcast( +define void @vecload_vs_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) { +; CHECK-LABEL: @vecload_vs_broadcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = 
getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V0_2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 @@ -67,20 +64,17 @@ ret void } -define void @shuffle_preserve_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) { -; CHECK-LABEL: @shuffle_preserve_broadcast2( +define void @vecload_vs_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) { +; CHECK-LABEL: @vecload_vs_broadcast2( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, 
double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -106,20 +100,17 @@ ret void } -define void @shuffle_preserve_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) { -; CHECK-LABEL: @shuffle_preserve_broadcast3( +define void @vecload_vs_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) { +; CHECK-LABEL: @vecload_vs_broadcast3( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0_2]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: 
[[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] @@ -184,22 +175,21 @@ ret void } -define void @shuffle_preserve_broadcast5(double * noalias %from, double * noalias %to, double %v1, double %v2) { -; CHECK-LABEL: @shuffle_preserve_broadcast5( +define void @vecload_vs_broadcast5(double * noalias %from, double * noalias %to, double %v1, double %v2) { +; CHECK-LABEL: @vecload_vs_broadcast5( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; 
CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void