Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -585,8 +585,7 @@
   ScalarToTreeEntry.clear();
   MustGather.clear();
   ExternalUses.clear();
-  NumLoadsWantToKeepOrder = 0;
-  NumLoadsWantToChangeOrder = 0;
+  NumOpsWantToKeepOrder.clear();
   for (auto &Iter : BlocksSchedules) {
     BlockScheduling *BS = Iter.second.get();
     BS->clear();
@@ -601,7 +600,12 @@
 
   /// \returns true if it is beneficial to reverse the vector order.
   bool shouldReorder() const {
-    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+    return std::accumulate(
+               NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
+               [](int Val1,
+                  const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
+                 return Val1 + (Val2.second < 0 ? 1 : -1);
+               }) > 0;
   }
 
   /// \return The vector element size in bits to use when vectorizing the
@@ -1201,11 +1205,10 @@
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
-  // Number of load bundles that contain consecutive loads.
-  int NumLoadsWantToKeepOrder = 0;
-
-  // Number of load bundles that contain consecutive loads in reversed order.
-  int NumLoadsWantToChangeOrder = 0;
+  /// Number of operation bundles that contain consecutive operations - number
+  /// of operation bundles that contain consecutive operations in reversed
+  /// order.
+  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
 
   // Analysis and block reference.
   Function *F;
@@ -1543,7 +1546,11 @@
       bool Reuse = canReuseExtract(VL, VL0);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+        ++NumOpsWantToKeepOrder[S.Opcode];
       } else {
+        SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
+        if (canReuseExtract(ReverseVL, VL0))
+          --NumOpsWantToKeepOrder[S.Opcode];
         BS.cancelScheduling(VL, VL0);
       }
       newTreeEntry(VL, Reuse, UserTreeIdx);
@@ -1593,7 +1600,7 @@
       }
 
       if (Consecutive) {
-        ++NumLoadsWantToKeepOrder;
+        ++NumOpsWantToKeepOrder[S.Opcode];
         newTreeEntry(VL, true, UserTreeIdx);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
@@ -1612,7 +1619,7 @@
       newTreeEntry(VL, false, UserTreeIdx);
 
       if (ReverseConsecutive) {
-        ++NumLoadsWantToChangeOrder;
+        --NumOpsWantToKeepOrder[S.Opcode];
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
       } else {
         DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -5,13 +5,12 @@
 ; CHECK-LABEL: @dotf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
 entry:
   %vecext = extractelement <4 x float> %x, i32 0
@@ -38,13 +37,12 @@
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %x = load <4 x double>, <4 x double>* %0, align 32
@@ -73,13 +71,12 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %x, align 16
@@ -108,13 +105,12 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %0 = load <4 x double>, <4 x double>* %x, align 32
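
Note on the new shouldReorder() heuristic: the patch replaces the two global load
counters with one signed tally per opcode, where in-order bundles increment the
entry and reversed-order bundles decrement it. Below is a minimal standalone
sketch (not LLVM code) of that voting scheme, using std::map and std::accumulate
from the standard library in place of llvm::DenseMap; the names OrderVotes, Load,
and ExtractElement are illustrative stand-ins, not identifiers from the patch.

// Sketch of the per-opcode order-voting scheme introduced by this patch.
#include <iostream>
#include <map>
#include <numeric>

int main() {
  // Keyed by opcode, like NumOpsWantToKeepOrder in the patch (which uses
  // llvm::DenseMap<unsigned, int>). Each entry holds the number of bundles
  // that keep the original order minus the number that prefer it reversed.
  std::map<unsigned, int> OrderVotes;

  unsigned Load = 0, ExtractElement = 1; // stand-ins for real opcodes

  OrderVotes[Load] -= 2;           // two reversed-order load bundles
  OrderVotes[ExtractElement] += 1; // one in-order extract bundle

  // Same reduction the patched shouldReorder() performs: every opcode with a
  // net preference for reversal (< 0) votes +1 for reordering, every other
  // opcode votes -1; reorder only if the reversal votes win outright.
  bool ShouldReorder =
      std::accumulate(OrderVotes.begin(), OrderVotes.end(), 0,
                      [](int Acc,
                         const std::pair<const unsigned, int> &Entry) {
                        return Acc + (Entry.second < 0 ? 1 : -1);
                      }) > 0;

  // Prints "keep": one opcode votes to reverse, one to keep, so the sum is 0
  // and the tie goes to keeping the original order.
  std::cout << (ShouldReorder ? "reverse" : "keep") << " the vector order\n";
  return 0;
}

Merging the two old counters into one signed tally per opcode lets in-order and
reversed bundles of the same opcode cancel each other, so a single operation
kind cannot force a whole-tree reversal that would pessimize the others.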