diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -70,9 +70,11 @@
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/FormattedStream.h"
 #ifdef EXPENSIVE_CHECKS
 #include "llvm/IR/Verifier.h"
 #endif
+#include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -117,6 +119,10 @@
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
 
+static cl::opt<bool>
+    SLPDumpIRWithCosts("slp-dump-ir-with-costs", cl::init(false), cl::Hidden,
+                       cl::desc("Dump IR annotated with SLP tree costs"));
+
 static cl::opt<bool>
     ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                        cl::desc("Attempt to vectorize horizontal reductions"));
@@ -2044,6 +2050,16 @@
 
   ~BoUpSLP();
 
+#ifndef NDEBUG
+  llvm::Optional<InstructionCost>
+  getVectorizedInstrCost(const Instruction *I) const {
+    auto It = VectorizedInstrCostMap.find(I);
+    return It != VectorizedInstrCostMap.end()
+               ? llvm::Optional<InstructionCost>(It->second)
+               : None;
+  }
+#endif
+
 private:
   /// Check if the operands on the edges \p Edges of the \p UserTE allows
   /// reordering (i.e. the operands can be reordered because they have only one
@@ -2246,6 +2262,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<EdgeInfo, 1> UserTreeIndices;
 
+    /// The cost (vector minus scalar) of this entry if we decide to
+    /// vectorize it.
+    InstructionCost Cost;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
@@ -2373,6 +2392,10 @@
       return FoundLane;
     }
 
+    void setCost(InstructionCost Cost) { this->Cost = Cost; }
+
+    InstructionCost getCost() const { return Cost; }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -2397,6 +2420,9 @@
         dbgs() << "NeedToGather\n";
         break;
       }
+      dbgs() << "Cost: "
+             << (Cost.getValue() ? std::to_string(*Cost.getValue()) : "-")
+             << "\n";
       dbgs() << "MainOp: ";
       if (MainOp)
         dbgs() << *MainOp << "\n";
@@ -2549,6 +2575,12 @@
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+#ifndef NDEBUG
+  /// Helper map for holding the cost of vectorized instructions. Used only in
+  /// lit tests with -slp-dump-ir-with-costs.
+  SmallDenseMap<const Instruction *, InstructionCost> VectorizedInstrCostMap;
+#endif
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -6188,6 +6220,7 @@
     TreeEntry &TE = *VectorizableTree[I];
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals);
+    TE.setCost(C);
     Cost += C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
                       << " for bundle that starts with " << *TE.Scalars[0]
@@ -6905,6 +6938,11 @@
     ShuffleBuilder.addMask(ReuseShuffleIndicies);
     Vec = ShuffleBuilder.finalize(Vec);
   }
+#ifndef NDEBUG
+  if (SLPDumpIRWithCosts)
+    if (Instruction *I = dyn_cast<Instruction>(Vec))
+      VectorizedInstrCostMap[I] = getGatherCost(VL);
+#endif
   return Vec;
 }
@@ -7495,7 +7533,18 @@
 
 Value *BoUpSLP::vectorizeTree() {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
-  return vectorizeTree(ExternallyUsedValues);
+  Value *V = vectorizeTree(ExternallyUsedValues);
+#ifndef NDEBUG
+  if (SLPDumpIRWithCosts) {
+    // Collect the cost of all vectorized instructions.
+    for (const auto &TE : VectorizableTree) {
+      if (Value *VectorizedValue = TE->VectorizedValue)
+        if (Instruction *I = dyn_cast<Instruction>(VectorizedValue))
+          VectorizedInstrCostMap[I] = TE->getCost();
+    }
+  }
+#endif
+  return V;
 }
 
 Value *
@@ -8830,6 +8879,24 @@
     R.optimizeGatherSequence();
     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
   }
+
+  if (SLPDumpIRWithCosts) {
+    /// Helper class for annotating vector instructions with their entry cost.
+    class SLPAssemblyAnnotationWriter : public AssemblyAnnotationWriter {
+      BoUpSLP *R;
+
+    public:
+      SLPAssemblyAnnotationWriter(BoUpSLP *R) : R(R) {}
+      void printInfoComment(const Value &V,
+                            formatted_raw_ostream &OS) override {
+        if (const Instruction *I = dyn_cast<Instruction>(&V))
+          if (Optional<InstructionCost> CostOpt = R->getVectorizedInstrCost(I))
+            OS << " Cost:" << CostOpt->getValue();
+      }
+    };
+    SLPAssemblyAnnotationWriter SLPAAW(&R);
+    F.print(outs(), &SLPAAW, true, true);
+  }
   return Changed;
 }
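Note on the mechanism: AssemblyAnnotationWriter::printInfoComment is invoked by LLVM's AsmWriter after each value it prints, so whatever the override writes lands at the end of that value's line; that is how the Cost: markers end up inline in the test below. A minimal standalone sketch of the same hook, independent of this patch, with a hypothetical constant annotation in place of the pass's cost-map lookup:

// annotate.cpp - sketch only; compile against LLVM, e.g.:
//   clang++ annotate.cpp $(llvm-config --cxxflags --ldflags --libs core asmparser support)
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

namespace {
struct CostAnnotator : public AssemblyAnnotationWriter {
  // Called once per printed value; output is appended to that value's line.
  void printInfoComment(const Value &V, formatted_raw_ostream &OS) override {
    if (isa<Instruction>(&V))
      OS << " ; Cost:42"; // hypothetical constant; the pass looks up a map here
  }
};
} // namespace

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseAssemblyString("define i32 @f(i32 %a) {\n"
                                                  "  %r = add i32 %a, 1\n"
                                                  "  ret i32 %r\n"
                                                  "}\n",
                                                  Err, Ctx);
  if (!M) {
    Err.print("annotate", errs());
    return 1;
  }
  CostAnnotator AAW;
  M->print(outs(), &AAW); // every instruction line gains " ; Cost:42"
  return 0;
}

Printing through outs() rather than dbgs() is what lets the lit test below pipe the annotated IR straight into FileCheck.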
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-threshold=-999 -disable-output -slp-dump-ir-with-costs < %s | FileCheck %s
 
 ; S[0] = %v1 + %v2
@@ -18,13 +18,24 @@
 ; CHECK-NEXT:    [[V1:%.*]] = sub i64 [[A0]], 1
 ; CHECK-NEXT:    [[V2:%.*]] = sub i64 [[B0]], 1
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3
+; CHECK-NEXT:    [[ADD3:%.*]] = add i64 [[V1]], [[V2]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer Cost:5
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer Cost:5
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] Cost:0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[V2]], [[V1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[V2]], [[V1]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i64 [[V1]], [[V2]]
+; CHECK-NEXT:    store i64 [[ADD3]], i64* [[IDXS3]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>*
-; CHECK-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8 Cost:-2
+; CHECK-NEXT:    store i64 [[ADD2]], i64* [[IDXS2]], align 8
+; CHECK-NEXT:    store i64 [[ADD1]], i64* [[IDXS1]], align 8
+; CHECK-NEXT:    store i64 [[ADD0]], i64* [[IDXS0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -65,20 +76,35 @@
 ; CHECK-NEXT:    [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8
 ; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
 ; CHECK-NEXT:    [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8
 ; CHECK-NEXT:    [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8
+; CHECK-NEXT:    [[V4:%.*]] = sext i16 [[D0]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[D0]], i32 3 Cost:4
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> Cost:1
+; CHECK-NEXT:    [[V5:%.*]] = sext i16 [[E0]] to i32
+; CHECK-NEXT:    [[V3:%.*]] = sext i16 [[C0]] to i32
+; CHECK-NEXT:    [[V2:%.*]] = sext i16 [[B0]] to i32
+; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[V1]], [[V4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer Cost:2
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] Cost:-3
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 [[V5]], [[V1]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[V3]], [[V1]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i32 [[V1]], [[V2]]
+; CHECK-NEXT:    store i32 [[ADD3]], i32* [[IDXS3]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 Cost:-3
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[IDXS2]], align 8
+; CHECK-NEXT:    store i32 [[ADD1]], i32* [[IDXS1]], align 8
+; CHECK-NEXT:    store i32 [[ADD0]], i32* [[IDXS0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
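Note on the updated RUN line: everything added to the pass is guarded by #ifndef NDEBUG, so -slp-dump-ir-with-costs only exists in assertion-enabled builds, and the test presupposes one. With -disable-output, opt suppresses its usual module printing, so the pass's own F.print(outs(), ...) call is the only IR that reaches FileCheck; it runs before the now-dead scalar code is cleaned up, which is presumably why the original scalar adds and stores still appear in the CHECK lines interleaved with the vector instructions, with Cost: annotations only on the values the SLP tree produced. The invocation mirrors the RUN line (input.ll is a placeholder):

  opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7-avx \
      -slp-threshold=-999 -disable-output -slp-dump-ir-with-costs input.ll

The sign follows the entry-cost convention in the patch (vector minus scalar), so a negative value such as Cost:-3 on the vector store marks a bundle that is cheaper vectorized than left scalar.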