diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -536,24 +536,26 @@
 }
 
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       const TargetLibraryInfo &TLI,
                                        unsigned BaseIndex = 0);
 
 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
 /// compatible instructions or constants, or just some other regular values.
 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
-                                Value *Op1) {
+                                Value *Op1, const TargetLibraryInfo &TLI) {
   return (isConstant(BaseOp0) && isConstant(Op0)) ||
          (isConstant(BaseOp1) && isConstant(Op1)) ||
          (!isa<UndefValue>(BaseOp0) && !isa<UndefValue>(Op0) &&
          !isa<UndefValue>(BaseOp1) && !isa<UndefValue>(Op1)) ||
-         getSameOpcode({BaseOp0, Op0}).getOpcode() ||
-         getSameOpcode({BaseOp1, Op1}).getOpcode();
+         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
+         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
 }
 
 /// \returns true if a compare instruction \p CI has similar "look" and
 /// same predicate as \p BaseCI, "as is" or with its operands and predicate
 /// swapped, false otherwise.
-static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI) {
+static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
+                               const TargetLibraryInfo &TLI) {
   assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
          "Assessing comparisons of different types?");
   CmpInst::Predicate BasePred = BaseCI->getPredicate();
@@ -566,15 +568,16 @@
   Value *Op1 = CI->getOperand(1);
 
   return (BasePred == Pred &&
-          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) ||
+          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
-          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0));
+          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
 }
 
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+                                       const TargetLibraryInfo &TLI,
                                        unsigned BaseIndex) {
   // Make sure these are all Instructions.
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
@@ -592,9 +595,19 @@
 
   // Check for one alternate opcode from another BinaryOperator.
   // TODO - generalize to support all operators (types, calls etc.).
+  auto *IBase = cast<Instruction>(VL[BaseIndex]);
+  Intrinsic::ID BaseID = 0;
+  SmallVector<VFInfo> BaseMappings;
+  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
+    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
+    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
+    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
+      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+  }
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
-    unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
-    if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+    auto *I = cast<Instruction>(VL[Cnt]);
+    unsigned InstOpcode = I->getOpcode();
+    if (IsBinOp && isa<BinaryOperator>(I)) {
       if (InstOpcode == Opcode || InstOpcode == AltOpcode)
         continue;
       if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
@@ -603,9 +616,11 @@
         AltIndex = Cnt;
         continue;
       }
-    } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
-      Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
-      Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+    } else if (IsCastOp && isa<CastInst>(I)) {
+      Value *Op0 = IBase->getOperand(0);
+      Type *Ty0 = Op0->getType();
+      Value *Op1 = I->getOperand(0);
+      Type *Ty1 = Op1->getType();
       if (Ty0 == Ty1) {
         if (InstOpcode == Opcode || InstOpcode == AltOpcode)
           continue;
@@ -634,11 +649,11 @@
           (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
         continue;
 
-      if (isCmpSameOrSwapped(BaseInst, Inst))
+      if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
         continue;
       auto *AltInst = cast<CmpInst>(VL[AltIndex]);
       if (AltIndex != BaseIndex) {
-        if (isCmpSameOrSwapped(AltInst, Inst))
+        if (isCmpSameOrSwapped(AltInst, Inst, TLI))
           continue;
       } else if (BasePred != CurrentPred) {
         assert(
@@ -652,8 +667,45 @@
           AltPred == CurrentPred || AltPred == SwappedCurrentPred)
         continue;
       }
-    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
+      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
+        if (Gep->getNumOperands() != 2 ||
+            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
+        if (!isVectorLikeInstWithConstOps(EI))
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+        auto *BaseLI = cast<LoadInst>(IBase);
+        if (!LI->isSimple() || !BaseLI->isSimple())
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+      } else if (auto *Call = dyn_cast<CallInst>(I)) {
+        auto *CallBase = cast<CallInst>(IBase);
+        if (Call->getCalledFunction() != CallBase->getCalledFunction())
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        if (Call->hasOperandBundles() &&
+            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
+                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
+                        CallBase->op_begin() +
+                            CallBase->getBundleOperandsStartIndex()))
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
+        if (ID != BaseID)
+          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        if (!ID) {
+          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
+          if (Mappings.size() != BaseMappings.size() ||
+              Mappings.front().ISA != BaseMappings.front().ISA ||
+              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
+              Mappings.front().VectorName != BaseMappings.front().VectorName ||
+              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
+              Mappings.front().Shape.Parameters !=
+                  BaseMappings.front().Shape.Parameters)
+            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+        }
+      }
       continue;
+    }
     return InstructionsState(VL[BaseIndex], nullptr,
nullptr); } @@ -1097,6 +1149,7 @@ /// A helper class used for scoring candidates for two consecutive lanes. class LookAheadHeuristics { + const TargetLibraryInfo &TLI; const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; @@ -1104,9 +1157,11 @@ int MaxLevel; // The maximum recursion depth for accumulating score. public: - LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, - const BoUpSLP &R, int NumLanes, int MaxLevel) - : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} + LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, + ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, + int MaxLevel) + : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes), + MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1128,6 +1183,8 @@ static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; + /// A load candidate for masked gather. + static const int ScoreMaskedGatherCandidate = 1; /// ExtractElementInst from same vector and consecutive indexes. static const int ScoreConsecutiveExtracts = 4; /// ExtractElementInst from same vector and reversed indices. @@ -1182,18 +1239,26 @@ auto *LI1 = dyn_cast(V1); auto *LI2 = dyn_cast(V2); if (LI1 && LI2) { - if (LI1->getParent() != LI2->getParent()) + if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || + !LI2->isSimple()) return LookAheadHeuristics::ScoreFail; Optional Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - if (!Dist || *Dist == 0) + if (!Dist || *Dist == 0) { + if (getUnderlyingObject(LI1->getPointerOperand()) == + getUnderlyingObject(LI2->getPointerOperand()) && + R.TTI->isLegalMaskedGather( + FixedVectorType::get(LI1->getType(), NumLanes), + LI1->getAlign())) + return LookAheadHeuristics::ScoreMaskedGatherCandidate; return LookAheadHeuristics::ScoreFail; + } // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return LookAheadHeuristics::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreMaskedGatherCandidate; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. @@ -1250,7 +1315,7 @@ SmallVector Ops(MainAltOps.begin(), MainAltOps.end()); Ops.push_back(I1); Ops.push_back(I2); - InstructionsState S = getSameOpcode(Ops); + InstructionsState S = getSameOpcode(Ops, TLI); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. if (S.getOpcode() && @@ -1426,6 +1491,7 @@ /// A vector of operand vectors. SmallVector OpsVec; + const TargetLibraryInfo &TLI; const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; @@ -1527,7 +1593,7 @@ int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef MainAltOps, int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed) { - LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(), LookAheadMaxDepth); // Keep track of the instruction stack as we recurse into the operands // during the look-ahead score exploration. @@ -1749,7 +1815,7 @@ // Use Boyer-Moore majority voting for finding the majority opcode and // the number of times it occurs. 
if (auto *I = dyn_cast(OpData.V)) { - if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() || I->getParent() != Parent) { if (NumOpsWithSameOpcodeParent == 0) { NumOpsWithSameOpcodeParent = 1; @@ -1851,9 +1917,9 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, const DataLayout &DL, - ScalarEvolution &SE, const BoUpSLP &R) - : DL(DL), SE(SE), R(R) { + VLOperands(ArrayRef RootVL, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) + : TLI(TLI), DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1994,7 +2060,7 @@ if (MainAltOps[OpIdx].size() != 2) { OperandData &AltOp = getData(OpIdx, Lane); InstructionsState OpS = - getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}); + getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI); if (OpS.getOpcode() && OpS.isAltShuffle()) MainAltOps[OpIdx].push_back(AltOp.V); } @@ -2069,7 +2135,7 @@ Optional findBestRootPair(ArrayRef> Candidates, int Limit = LookAheadHeuristics::ScoreFail) { - LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, RootLookAheadMaxDepth); int BestScore = Limit; Optional Index; @@ -2244,12 +2310,10 @@ /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. - static void reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R); + static void reorderInputsAccordingToOpcode( + ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R); /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the /// users of \p TE and collects the stores. It returns the map from the store @@ -2607,7 +2671,7 @@ return UndefValue::get(VL.front()->getType()); return VL[Idx]; }); - InstructionsState S = getSameOpcode(Last->Scalars); + InstructionsState S = getSameOpcode(Last->Scalars, *TLI); Last->setOperations(S); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); } @@ -3540,6 +3604,7 @@ } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, + const TargetLibraryInfo &TLI, bool CompareOpcodes = true) { if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2)) return false; @@ -3553,7 +3618,7 @@ ((isConstant(GEP1->getOperand(1)) && isConstant(GEP2->getOperand(1))) || !CompareOpcodes || - getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}) + getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI) .getOpcode()); } @@ -3562,7 +3627,7 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, - LoopInfo &LI, + LoopInfo &LI, const TargetLibraryInfo &TLI, SmallVectorImpl &Order, SmallVectorImpl &PointerOps) { // Check that a vectorized load would load the same memory as a scalar @@ -3592,8 +3657,8 @@ Order.clear(); // Check the order of pointer operands or that all pointers are the same. 
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); - if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) { - return arePointersCompatible(P, PointerOps.front()); + if (IsSorted || all_of(PointerOps, [&](Value *P) { + return arePointersCompatible(P, PointerOps.front(), TLI); })) { if (IsSorted) { Value *Ptr0; @@ -4715,7 +4780,8 @@ /// the given \p MainOp and \p AltOp instructions. static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, - const Instruction *AltOp); + const Instruction *AltOp, + const TargetLibraryInfo &TLI); void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { @@ -4761,7 +4827,7 @@ return true; }; - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of // a load), in which case peek through to include it in the tree, without @@ -4969,7 +5035,7 @@ // Reset S to make it GetElementPtr kind of node. const auto *It = find_if(VL, [](Value *V) { return isa(V); }); assert(It != VL.end() && "Expected at least one GEP."); - S = getSameOpcode(*It); + S = getSameOpcode(*It, *TLI); } // Check that all of the users of the scalars that we want to vectorize are @@ -5173,8 +5239,8 @@ SmallVector PointerOps; OrdersType CurrentOrder; TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder, - PointerOps)) { + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, + CurrentOrder, PointerOps)) { case LoadsState::Vectorize: if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. @@ -5285,7 +5351,7 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -5332,7 +5398,7 @@ // have the same opcode. 
if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -5643,7 +5709,8 @@ if (!CI || all_of(VL, [](Value *V) { return cast(V)->isCommutative(); })) { - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(VL, Left, Right, *TLI, *DL, *SE, + *this); } else { auto *MainCI = cast(S.MainOp); auto *AltCI = cast(S.AltOp); @@ -5658,7 +5725,7 @@ Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); - if (isAlternateInstruction(Cmp, MainCI, AltCI)) { + if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) { if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate())) std::swap(LHS, RHS); } else { @@ -5952,16 +6019,17 @@ static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, - const Instruction *AltOp) { + const Instruction *AltOp, + const TargetLibraryInfo &TLI) { if (auto *MainCI = dyn_cast(MainOp)) { auto *AltCI = cast(AltOp); CmpInst::Predicate MainP = MainCI->getPredicate(); CmpInst::Predicate AltP = AltCI->getPredicate(); assert(MainP != AltP && "Expected different main/alternate predicates."); auto *CI = cast(I); - if (isCmpSameOrSwapped(MainCI, CI)) + if (isCmpSameOrSwapped(MainCI, CI, TLI)) return false; - if (isCmpSameOrSwapped(AltCI, CI)) + if (isCmpSameOrSwapped(AltCI, CI, TLI)) return true; CmpInst::Predicate P = CI->getPredicate(); CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P); @@ -6231,7 +6299,7 @@ OrdersType CurrentOrder; LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI, - CurrentOrder, PointerOps); + *TLI, CurrentOrder, PointerOps); switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: @@ -7666,15 +7734,13 @@ // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const TargetLibraryInfo &TLI, + const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL, TLI, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -7967,13 +8033,13 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { const unsigned VF = VL.size(); - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. 
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) { const auto *It = find_if(VL, [](Value *V) { return isa(V); }); if (It != VL.end()) - S = getSameOpcode(*It); + S = getSameOpcode(*It, *TLI); } if (S.getOpcode()) { if (TreeEntry *E = getTreeEntry(S.OpValue)) @@ -8691,9 +8757,10 @@ SmallVector Mask; buildShuffleEntryMask( E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { + [E, this](Instruction *I) { assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return isAlternateInstruction(I, E->getMainOp(), E->getAltOp()); + return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(), + *TLI); }, Mask, &OpScalars, &AltScalars); @@ -10607,7 +10674,7 @@ // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL, *TLI); if (!S.getOpcode()) return false; @@ -11252,7 +11319,7 @@ } for (LoadInst *RLI : LIt->second) { if (arePointersCompatible(RLI->getPointerOperand(), - LI->getPointerOperand())) { + LI->getPointerOperand(), TLI)) { hash_code SubKey = hash_value(RLI->getPointerOperand()); DoNotReverseVals.insert(RLI); return SubKey; @@ -11295,7 +11362,7 @@ } for (LoadInst *RLI : LIt->second) { if (arePointersCompatible(RLI->getPointerOperand(), - LI->getPointerOperand())) { + LI->getPointerOperand(), TLI)) { hash_code SubKey = hash_value(RLI->getPointerOperand()); DoNotReverseVals.insert(RLI); return SubKey; @@ -11366,7 +11433,8 @@ } /// Attempt to vectorize the tree found by matchAssociativeReduction. - Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, + const TargetLibraryInfo &TLI) { constexpr int ReductionLimit = 4; constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; @@ -11439,7 +11507,7 @@ // Try to vectorize elements based on their type. for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { ArrayRef OrigReducedVals = ReducedVals[I]; - InstructionsState S = getSameOpcode(OrigReducedVals); + InstructionsState S = getSameOpcode(OrigReducedVals, TLI); SmallVector Candidates; Candidates.reserve(2 * OrigReducedVals.size()); DenseMap TrackedToOrig(2 * OrigReducedVals.size()); @@ -11459,7 +11527,7 @@ // Try to handle shuffled extractelements. if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() && I + 1 < E) { - InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]); + InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI); if (NextS.getOpcode() == Instruction::ExtractElement && !NextS.isAltShuffle()) { SmallVector CommonCandidates(Candidates); @@ -12125,7 +12193,7 @@ if (IsBinop || IsSelect) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI)) - return HorRdx.tryToReduce(R, TTI); + return HorRdx.tryToReduce(R, TTI, *TLI); } return nullptr; }; @@ -12318,7 +12386,7 @@ /// predicate of the second or the operands IDs are less than the operands IDs /// of the second cmp instruction. 
template -static bool compareCmp(Value *V, Value *V2, +static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, function_ref IsDeleted) { auto *CI1 = cast(V); auto *CI2 = cast(V2); @@ -12354,7 +12422,7 @@ if (auto *I2 = dyn_cast(Op2)) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, TLI); if (S.getOpcode()) continue; return false; @@ -12408,15 +12476,15 @@ } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. - auto &&CompareSorter = [&R](Value *V, Value *V2) { - return compareCmp(V, V2, + auto CompareSorter = [&](Value *V, Value *V2) { + return compareCmp(V, V2, *TLI, [&R](Instruction *I) { return R.isDeleted(I); }); }; - auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { + auto AreCompatibleCompares = [&](Value *V1, Value *V2) { if (V1 == V2) return true; - return compareCmp(V1, V2, + return compareCmp(V1, V2, *TLI, [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { @@ -12498,7 +12566,7 @@ "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) continue; return I1->getOpcode() < I2->getOpcode(); @@ -12515,7 +12583,7 @@ } return ConstOrder && *ConstOrder; }; - auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { + auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) { if (V1 == V2) return true; if (V1->getType() != V2->getType()) @@ -12532,7 +12600,7 @@ if (auto *I2 = dyn_cast(Opcodes2[I])) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) continue; return false; @@ -12835,7 +12903,7 @@ "Different nodes should have different DFS numbers"); if (NodeI1 != NodeI2) return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); if (S.getOpcode()) return false; return I1->getOpcode() < I2->getOpcode(); @@ -12847,7 +12915,7 @@ V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) { + auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { if (V1 == V2) return true; if (V1->getPointerOperandType() != V2->getPointerOperandType()) @@ -12860,7 +12928,7 @@ if (auto *I2 = dyn_cast(V2->getValueOperand())) { if (I1->getParent() != I2->getParent()) return false; - InstructionsState S = getSameOpcode({I1, I2}); + InstructionsState S = getSameOpcode({I1, I2}, *TLI); return S.getOpcode() > 0; } if (isa(V1->getValueOperand()) && diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -62,11 +62,11 @@ ; Don't vectorize volatile loads. 
 define void @test_volatile_load(double* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test_volatile_load(
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A]], align 8
-; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B]], align 8
+; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
 ; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -17,22 +17,22 @@
 ; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 5), align 8
 ; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 6), align 8
 ; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]])
-; CHECK-NEXT:    [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]])
-; CHECK-NEXT:    [[SQRT0:%.*]] = call fast double @llvm.sqrt.f64(double [[A0]])
-; CHECK-NEXT:    [[SQRT1:%.*]] = call fast double @llvm.sqrt.f64(double [[A1]])
-; CHECK-NEXT:    [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
-; CHECK-NEXT:    [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
-; CHECK-NEXT:    [[SQRT2:%.*]] = call fast double @llvm.sqrt.f64(double [[A4]])
-; CHECK-NEXT:    [[SQRT3:%.*]] = call fast double @llvm.sqrt.f64(double [[A5]])
-; CHECK-NEXT:    [[RES1:%.*]] = fadd fast double [[SQRT0]], [[SIN1]]
-; CHECK-NEXT:    [[RES2:%.*]] = fadd fast double [[SIN0]], [[SQRT1]]
-; CHECK-NEXT:    [[RES00:%.*]] = fadd fast double [[RES1]], [[RES2]]
-; CHECK-NEXT:    [[RES3:%.*]] = fadd fast double [[SQRT2]], [[SIN3]]
-; CHECK-NEXT:    [[RES4:%.*]] = fadd fast double [[SIN2]], [[SQRT3]]
-; CHECK-NEXT:    [[RES01:%.*]] = fadd fast double [[RES3]], [[RES4]]
-; CHECK-NEXT:    store double [[RES00]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst, i32 0, i64 0), align 8
-; CHECK-NEXT:    store double [[RES01]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst, i32 0, i64 1), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    store <2 x double> [[TMP15]], <2 x double>* bitcast ([8 x double]* @dst to <2 x double>*), align 8
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src, i32 0, i64 0), align 8
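
Note (illustrative, not part of the patch): the functional core of the change is that getSameOpcode() is now TLI-aware. Two scalar calls only receive a common opcode, and hence may be bundled into one vector call, if they call the same function, resolve to the same vectorizable intrinsic ID, and, for plain library calls, have identical vector-function-ABI mappings in the VFDatabase. The standalone sketch below restates that rule; areCompatibleCalls is a hypothetical helper name (the real checks are inlined in getSameOpcode, see the first hunk), and it only assumes the existing VectorUtils interfaces already used by the patch.

// Sketch only: the call-compatibility rule that getSameOpcode() now applies.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

static bool areCompatibleCalls(CallInst *Base, CallInst *Other,
                               const TargetLibraryInfo &TLI) {
  // Calls to different functions can never form a single vector call.
  if (Base->getCalledFunction() != Other->getCalledFunction())
    return false;
  // Both calls must map to the same vectorizable intrinsic ID; a libm call
  // such as sinf can map to llvm.sin.f32 via TLI.
  Intrinsic::ID BaseID = getVectorIntrinsicIDForCall(Base, &TLI);
  Intrinsic::ID OtherID = getVectorIntrinsicIDForCall(Other, &TLI);
  if (BaseID != OtherID)
    return false;
  if (BaseID != Intrinsic::not_intrinsic)
    return true;
  // Otherwise fall back to the vector-function ABI: both calls need identical
  // library mappings (ISA, scalar/vector names and shape), e.g. as provided
  // by -vector-library=SVML.
  SmallVector<VFInfo> BaseMaps = VFDatabase(*Base).getMappings(*Base);
  SmallVector<VFInfo> OtherMaps = VFDatabase(*Other).getMappings(*Other);
  if (BaseMaps.empty() || BaseMaps.size() != OtherMaps.size())
    return false;
  return BaseMaps.front().ISA == OtherMaps.front().ISA &&
         BaseMaps.front().ScalarName == OtherMaps.front().ScalarName &&
         BaseMaps.front().VectorName == OtherMaps.front().VectorName &&
         BaseMaps.front().Shape.VF == OtherMaps.front().Shape.VF &&
         BaseMaps.front().Shape.Parameters ==
             OtherMaps.front().Shape.Parameters;
}

This is why the sin-sqrt.ll checks above change from eight scalar intrinsic calls to two llvm.sin.v2f64 and two llvm.sqrt.v2f64 calls: the call pairs now survive getSameOpcode() and can be included in the SLP tree.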
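
Note (illustrative, not part of the patch): the LookAheadHeuristics hunks also relax the operand scorer for load pairs that only a masked gather can serve. The sketch below restates that scoring rule in isolation; scoreLoadPair and the local constants are hypothetical stand-ins for LookAheadHeuristics::getShallowScore and its ScoreConsecutiveLoads / ScoreMaskedGatherCandidate / ScoreFail members, and the tail of the real function (splat, reversed and strided load cases) is omitted.

// Sketch only: simplified load-pair scoring after this patch.
#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include <cstdlib>
using namespace llvm;

static int scoreLoadPair(LoadInst *LI1, LoadInst *LI2, int NumLanes,
                         const DataLayout &DL, ScalarEvolution &SE,
                         const TargetTransformInfo &TTI) {
  constexpr int ScoreFail = 0;
  constexpr int ScoreConsecutiveLoads = 4;
  constexpr int ScoreMaskedGatherCandidate = 1; // New constant in this patch.
  // New: volatile/atomic loads never match (getSameOpcode now also rejects
  // them), nor do loads from different blocks.
  if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
      !LI2->isSimple())
    return ScoreFail;
  Optional<int> Dist = getPointersDiff(
      LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
      LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  if (!Dist || *Dist == 0) {
    // New: pointers that cannot be ordered but address the same underlying
    // object keep a small positive score when the target has a legal masked
    // gather, instead of failing outright.
    if (getUnderlyingObject(LI1->getPointerOperand()) ==
            getUnderlyingObject(LI2->getPointerOperand()) &&
        TTI.isLegalMaskedGather(FixedVectorType::get(LI1->getType(), NumLanes),
                                LI1->getAlign()))
      return ScoreMaskedGatherCandidate;
    return ScoreFail;
  }
  // Changed: a large gap between the two loads is now scored as a
  // masked-gather candidate rather than as an alternate-opcode match.
  if (std::abs(*Dist) > NumLanes / 2)
    return ScoreMaskedGatherCandidate;
  // Consecutive (or nearly consecutive) loads keep their higher score.
  return ScoreConsecutiveLoads;
}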