Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -302,9 +302,8 @@
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
-      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
-        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
-        DL(DL), ORE(ORE), Builder(Se->getContext()) {
+      : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+        DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -357,8 +356,7 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
-    NumLoadsWantToKeepOrder = 0;
-    NumLoadsWantToChangeOrder = 0;
+    NumOpsWantToKeepOrder.clear();
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -373,7 +371,9 @@
 
   /// \returns true if it is beneficial to reverse the vector order.
   bool shouldReorder() const {
-    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+    return llvm::any_of(
+        NumOpsWantToKeepOrder,
+        [](const std::pair<unsigned, int> &Pair) { return Pair.second < 0; });
   }
 
   /// \return The vector element size in bits to use when vectorizing the
@@ -920,11 +920,10 @@
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
-  // Number of load bundles that contain consecutive loads.
-  int NumLoadsWantToKeepOrder;
-
-  // Number of load bundles that contain consecutive loads in reversed order.
-  int NumLoadsWantToChangeOrder;
+  /// Number of operation bundles that contain consecutive operations - number
+  /// of operation bundles that contain consecutive operations in reversed
+  /// order.
+  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
 
   // Analysis and block reference.
   Function *F;
@@ -1268,7 +1267,11 @@
       bool Reuse = canReuseExtract(VL, Opcode);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+        ++NumOpsWantToKeepOrder[Opcode];
       } else {
+        SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
+        if (canReuseExtract(ReverseVL, Opcode))
+          --NumOpsWantToKeepOrder[Opcode];
         BS.cancelScheduling(VL);
       }
       newTreeEntry(VL, Reuse, UserTreeIdx);
@@ -1319,7 +1322,7 @@
       }
 
       if (Consecutive) {
-        ++NumLoadsWantToKeepOrder;
+        ++NumOpsWantToKeepOrder[Opcode];
         newTreeEntry(VL, true, UserTreeIdx);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
@@ -1338,7 +1341,7 @@
       newTreeEntry(VL, false, UserTreeIdx);
 
       if (ReverseConsecutive) {
-        ++NumLoadsWantToChangeOrder;
+        --NumOpsWantToKeepOrder[Opcode];
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
       } else {
         DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
Index: test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
+++ test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -5,13 +5,12 @@
 ; CHECK-LABEL: @dotf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
 entry:
   %vecext = extractelement <4 x float> %x, i32 0
@@ -38,13 +37,12 @@
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %x = load <4 x double>, <4 x double>* %0, align 32
@@ -73,13 +71,12 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %x, align 16
@@ -108,13 +105,12 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %0 = load <4 x double>, <4 x double>* %x, align 32
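
Summary of the SLPVectorizer.cpp change: the two global load-only counters become a single signed tally per opcode. A bundle of in-order consecutive operations increments its opcode's entry, a bundle that is consecutive only after reversal decrements it, and shouldReorder() now fires as soon as any one opcode's tally goes negative instead of comparing two totals across loads alone. Below is a minimal standalone sketch of that bookkeeping, using plain STL containers in place of llvm::DenseMap and llvm::any_of; the helper functions and opcode constants are hypothetical, introduced only for illustration.

    #include <algorithm>
    #include <cassert>
    #include <unordered_map>
    #include <utility>

    namespace sketch {

    // One signed counter per opcode, mirroring NumOpsWantToKeepOrder:
    // in-order bundles push it up, reversed-order bundles push it down.
    std::unordered_map<unsigned, int> NumOpsWantToKeepOrder;

    // Hypothetical opcode constants standing in for Instruction opcodes.
    constexpr unsigned Load = 0;
    constexpr unsigned ExtractElement = 1;

    // Called when a bundle of consecutive, in-order operations is found.
    void foundConsecutiveBundle(unsigned Opcode) {
      ++NumOpsWantToKeepOrder[Opcode];
    }

    // Called when a bundle is consecutive only in reversed order.
    void foundReversedBundle(unsigned Opcode) {
      --NumOpsWantToKeepOrder[Opcode];
    }

    // Mirrors the patched shouldReorder(): reversing the vector order is
    // considered beneficial as soon as any opcode saw more reversed-order
    // bundles than in-order ones.
    bool shouldReorder() {
      return std::any_of(NumOpsWantToKeepOrder.begin(),
                         NumOpsWantToKeepOrder.end(),
                         [](const std::pair<const unsigned, int> &Pair) {
                           return Pair.second < 0;
                         });
    }

    } // namespace sketch

    int main() {
      // Two in-order load bundles and one reversed extractelement bundle:
      // the extractelement tally goes negative, so reordering is requested
      // even though the loads vote two-to-zero to keep the current order.
      sketch::foundConsecutiveBundle(sketch::Load);
      sketch::foundConsecutiveBundle(sketch::Load);
      sketch::foundReversedBundle(sketch::ExtractElement);
      assert(sketch::shouldReorder());
    }

Under the old scheme this example would not reorder (NumLoadsWantToChangeOrder = 0 is not greater than NumLoadsWantToKeepOrder = 2); under the per-opcode scheme the lone reversed extractelement bundle is enough. That sensitivity is what lets the updated test drop the reversing shufflevector from the reduction sequence in the CHECK lines above.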